Diffstat (limited to 'third_party')
-rw-r--r--  third_party/aom/CMakeLists.txt | 76
-rw-r--r--  third_party/aom/README.md | 10
-rw-r--r--  third_party/aom/aom/aom.h | 6
-rw-r--r--  third_party/aom/aom/aom_codec.h | 14
-rw-r--r--  third_party/aom/aom/aom_decoder.h | 6
-rw-r--r--  third_party/aom/aom/aom_encoder.h | 35
-rw-r--r--  third_party/aom/aom/aom_frame_buffer.h | 6
-rw-r--r--  third_party/aom/aom/aom_image.h | 6
-rw-r--r--  third_party/aom/aom/aom_integer.h | 6
-rw-r--r--  third_party/aom/aom/aomcx.h | 54
-rw-r--r--  third_party/aom/aom/aomdx.h | 14
-rw-r--r--  third_party/aom/aom/exports_com | 13
-rw-r--r--  third_party/aom/aom/exports_dec | 2
-rw-r--r--  third_party/aom/aom/exports_enc | 9
-rw-r--r--  third_party/aom/aom/exports_test | 2
-rw-r--r--  third_party/aom/aom/internal/aom_codec_internal.h | 8
-rw-r--r--  third_party/aom/aom/src/aom_decoder.c | 4
-rw-r--r--  third_party/aom/aom/src/aom_encoder.c | 8
-rw-r--r--  third_party/aom/aom_dsp/aom_convolve.c | 1
-rw-r--r--  third_party/aom/aom_dsp/aom_convolve.h | 56
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp.cmake | 30
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp_common.h | 6
-rwxr-xr-x  third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 118
-rw-r--r--  third_party/aom/aom_dsp/aom_filter.h | 6
-rw-r--r--  third_party/aom/aom_dsp/aom_simd.h | 6
-rw-r--r--  third_party/aom/aom_dsp/aom_simd_inline.h | 6
-rw-r--r--  third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 6
-rw-r--r--  third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 22
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.c | 14
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.h | 8
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.c | 18
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.h | 7
-rw-r--r--  third_party/aom/aom_dsp/bitreader.h | 12
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.c | 10
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.h | 8
-rw-r--r--  third_party/aom/aom_dsp/bitwriter.h | 6
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.c | 12
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.h | 8
-rw-r--r--  third_party/aom/aom_dsp/blend.h | 6
-rw-r--r--  third_party/aom/aom_dsp/buf_ans.h | 6
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.c | 6
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.h | 9
-rw-r--r--  third_party/aom/aom_dsp/daalaboolwriter.h | 6
-rw-r--r--  third_party/aom/aom_dsp/entcode.h | 6
-rw-r--r--  third_party/aom/aom_dsp/entdec.c | 1
-rw-r--r--  third_party/aom/aom_dsp/entdec.h | 6
-rw-r--r--  third_party/aom/aom_dsp/entenc.c | 1
-rw-r--r--  third_party/aom/aom_dsp/entenc.h | 6
-rw-r--r--  third_party/aom/aom_dsp/fft_common.h | 6
-rw-r--r--  third_party/aom/aom_dsp/grain_synthesis.c | 67
-rw-r--r--  third_party/aom/aom_dsp/grain_synthesis.h | 24
-rw-r--r--  third_party/aom/aom_dsp/grain_table.h | 2
-rw-r--r--  third_party/aom/aom_dsp/intrapred_common.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/common_dspr2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 1
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c | 1
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve_common_dspr2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_msa.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/macros_msa.h | 6
-rw-r--r--  third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c | 22
-rw-r--r--  third_party/aom/aom_dsp/noise_model.c | 4
-rw-r--r--  third_party/aom/aom_dsp/noise_model.h | 6
-rw-r--r--  third_party/aom/aom_dsp/noise_util.h | 6
-rw-r--r--  third_party/aom/aom_dsp/postproc.h | 6
-rw-r--r--  third_party/aom/aom_dsp/prob.h | 6
-rw-r--r--  third_party/aom/aom_dsp/psnr.c | 1
-rw-r--r--  third_party/aom/aom_dsp/psnr.h | 6
-rw-r--r--  third_party/aom/aom_dsp/quantize.c | 217
-rw-r--r--  third_party/aom/aom_dsp/quantize.h | 29
-rw-r--r--  third_party/aom/aom_dsp/sad.c | 7
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 9
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 6
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 6
-rw-r--r--  third_party/aom/aom_dsp/sse.c | 52
-rw-r--r--  third_party/aom/aom_dsp/ssim.c | 2
-rw-r--r--  third_party/aom/aom_dsp/ssim.h | 6
-rw-r--r--  third_party/aom/aom_dsp/txfm_common.h | 6
-rw-r--r--  third_party/aom/aom_dsp/variance.c | 399
-rw-r--r--  third_party/aom/aom_dsp/variance.h | 16
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 7
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 556
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 7
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c | 900
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 514
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 8
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_mask_sse4.h | 237
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_sse4.h | 69
-rw-r--r--  third_party/aom/aom_dsp/x86/common_avx2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve.h | 32
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_avx2.h | 11
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_common_intrin.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_sse4_1.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm | 351
-rw-r--r--  third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c | 78
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 707
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 70
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 149
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_avx2.c | 140
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 72
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 12
-rw-r--r--  third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 4
-rw-r--r--  third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 916
-rw-r--r--  third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 1275
-rw-r--r--  third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c | 1
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/mem_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h | 58
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 15
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_variance_avx2.c | 190
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 41
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm | 67
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_sse2.c | 329
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 44
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_x86.h | 77
-rw-r--r--  third_party/aom/aom_dsp/x86/sse_avx2.c | 250
-rw-r--r--  third_party/aom/aom_dsp/x86/sse_sse4.c | 241
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_avx2.c | 79
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 10
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_sse2.h | 22
-rw-r--r--  third_party/aom/aom_dsp/x86/synonyms.h | 15
-rw-r--r--  third_party/aom/aom_dsp/x86/synonyms_avx2.h | 16
-rw-r--r--  third_party/aom/aom_dsp/x86/transpose_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 6
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_avx2.c | 3
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_sse2.c | 166
-rw-r--r--  third_party/aom/aom_mem/aom_mem.h | 6
-rw-r--r--  third_party/aom/aom_mem/include/aom_mem_intrnl.h | 6
-rw-r--r--  third_party/aom/aom_ports/aom_once.h | 6
-rw-r--r--  third_party/aom/aom_ports/aom_timer.h | 6
-rw-r--r--  third_party/aom/aom_ports/arm.h | 8
-rw-r--r--  third_party/aom/aom_ports/arm_cpudetect.c | 8
-rw-r--r--  third_party/aom/aom_ports/bitops.h | 10
-rw-r--r--  third_party/aom/aom_ports/emmintrin_compat.h | 6
-rw-r--r--  third_party/aom/aom_ports/mem.h | 6
-rw-r--r--  third_party/aom/aom_ports/mem_ops.h | 6
-rw-r--r--  third_party/aom/aom_ports/mem_ops_aligned.h | 6
-rw-r--r--  third_party/aom/aom_ports/msvc.h | 6
-rw-r--r--  third_party/aom/aom_ports/ppc.h | 6
-rw-r--r--  third_party/aom/aom_ports/sanitizer.h | 6
-rw-r--r--  third_party/aom/aom_ports/system_state.h | 6
-rw-r--r--  third_party/aom/aom_ports/x86.h | 6
-rw-r--r--  third_party/aom/aom_scale/aom_scale.h | 6
-rw-r--r--  third_party/aom/aom_scale/generic/yv12config.c | 24
-rw-r--r--  third_party/aom/aom_scale/yv12config.h | 10
-rw-r--r--  third_party/aom/aom_util/aom_thread.h | 6
-rw-r--r--  third_party/aom/aom_util/debug_util.h | 6
-rw-r--r--  third_party/aom/aom_util/endian_inl.h | 6
-rw-r--r--  third_party/aom/apps/aomdec.c | 143
-rw-r--r--  third_party/aom/apps/aomenc.c | 77
-rw-r--r--  third_party/aom/apps/aomenc.h | 7
-rw-r--r--  third_party/aom/av1/av1.cmake | 14
-rw-r--r--  third_party/aom/av1/av1_cx_iface.c | 160
-rw-r--r--  third_party/aom/av1/av1_dx_iface.c | 62
-rw-r--r--  third_party/aom/av1/av1_iface_common.h | 7
-rw-r--r--  third_party/aom/av1/common/alloccommon.h | 6
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 8
-rw-r--r--  third_party/aom/av1/common/arm/blend_a64_hmask_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/blend_a64_vmask_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/cfl_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/convolve_neon.c | 363
-rw-r--r--  third_party/aom/av1/common/arm/convolve_neon.h | 6
-rw-r--r--  third_party/aom/av1/common/arm/jnt_convolve_neon.c | 512
-rw-r--r--  third_party/aom/av1/common/arm/mem_neon.h | 15
-rw-r--r--  third_party/aom/av1/common/arm/selfguided_neon.c | 18
-rw-r--r--  third_party/aom/av1/common/arm/transpose_neon.h | 83
-rw-r--r--  third_party/aom/av1/common/arm/warp_plane_neon.c | 714
-rw-r--r--  third_party/aom/av1/common/arm/wiener_convolve_neon.c | 145
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d.c | 140
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d.h | 6
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 6
-rw-r--r--  third_party/aom/av1/common/av1_loopfilter.c | 945
-rw-r--r--  third_party/aom/av1/common/av1_loopfilter.h | 120
-rwxr-xr-x  third_party/aom/av1/common/av1_rtcd_defs.pl | 46
-rw-r--r--  third_party/aom/av1/common/av1_txfm.c | 50
-rw-r--r--  third_party/aom/av1/common/av1_txfm.h | 32
-rw-r--r--  third_party/aom/av1/common/blockd.c | 64
-rw-r--r--  third_party/aom/av1/common/blockd.h | 53
-rw-r--r--  third_party/aom/av1/common/cdef.h | 6
-rw-r--r--  third_party/aom/av1/common/cdef_block.h | 6
-rw-r--r--  third_party/aom/av1/common/cdef_block_simd.h | 5
-rw-r--r--  third_party/aom/av1/common/cfl.h | 6
-rw-r--r--  third_party/aom/av1/common/common.h | 6
-rw-r--r--  third_party/aom/av1/common/common_data.h | 75
-rw-r--r--  third_party/aom/av1/common/convolve.c | 116
-rw-r--r--  third_party/aom/av1/common/convolve.h | 21
-rw-r--r--  third_party/aom/av1/common/entropy.h | 6
-rw-r--r--  third_party/aom/av1/common/entropymode.h | 8
-rw-r--r--  third_party/aom/av1/common/entropymv.c | 55
-rw-r--r--  third_party/aom/av1/common/entropymv.h | 16
-rw-r--r--  third_party/aom/av1/common/enums.h | 12
-rw-r--r--  third_party/aom/av1/common/filter.h | 22
-rw-r--r--  third_party/aom/av1/common/frame_buffers.c | 11
-rw-r--r--  third_party/aom/av1/common/frame_buffers.h | 12
-rw-r--r--  third_party/aom/av1/common/idct.c | 274
-rw-r--r--  third_party/aom/av1/common/idct.h | 31
-rw-r--r--  third_party/aom/av1/common/mv.h | 8
-rw-r--r--  third_party/aom/av1/common/mvref_common.c | 381
-rw-r--r--  third_party/aom/av1/common/mvref_common.h | 43
-rw-r--r--  third_party/aom/av1/common/obmc.h | 14
-rw-r--r--  third_party/aom/av1/common/obu_util.c | 147
-rw-r--r--  third_party/aom/av1/common/obu_util.h | 47
-rw-r--r--  third_party/aom/av1/common/odintrin.h | 12
-rw-r--r--  third_party/aom/av1/common/onyxc_int.h | 29
-rw-r--r--  third_party/aom/av1/common/ppc/cfl_ppc.c | 85
-rw-r--r--  third_party/aom/av1/common/pred_common.c | 4
-rw-r--r--  third_party/aom/av1/common/pred_common.h | 6
-rw-r--r--  third_party/aom/av1/common/quant_common.h | 6
-rw-r--r--  third_party/aom/av1/common/reconinter.c | 652
-rw-r--r--  third_party/aom/av1/common/reconinter.h | 148
-rw-r--r--  third_party/aom/av1/common/reconintra.h | 6
-rw-r--r--  third_party/aom/av1/common/resize.c | 39
-rw-r--r--  third_party/aom/av1/common/resize.h | 6
-rw-r--r--  third_party/aom/av1/common/restoration.c | 131
-rw-r--r--  third_party/aom/av1/common/restoration.h | 7
-rw-r--r--  third_party/aom/av1/common/scale.h | 7
-rw-r--r--  third_party/aom/av1/common/scan.h | 6
-rw-r--r--  third_party/aom/av1/common/seg_common.h | 6
-rw-r--r--  third_party/aom/av1/common/thread_common.c | 18
-rw-r--r--  third_party/aom/av1/common/thread_common.h | 6
-rw-r--r--  third_party/aom/av1/common/tile_common.c | 16
-rw-r--r--  third_party/aom/av1/common/tile_common.h | 9
-rw-r--r--  third_party/aom/av1/common/timing.h | 6
-rw-r--r--  third_party/aom/av1/common/token_cdfs.h | 5
-rw-r--r--  third_party/aom/av1/common/txb_common.h | 243
-rw-r--r--  third_party/aom/av1/common/warped_motion.c | 4
-rw-r--r--  third_party/aom/av1/common/warped_motion.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h | 10
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse2.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse4.h | 11
-rw-r--r--  third_party/aom/av1/common/x86/cfl_simd.h | 5
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_avx2.c | 2
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_sse2.c | 3
-rw-r--r--  third_party/aom/av1/common/x86/convolve_sse2.c | 11
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c | 1117
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c | 3935
-rw-r--r--  third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h | 28
-rw-r--r--  third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c | 268
-rw-r--r--  third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 211
-rw-r--r--  third_party/aom/av1/common/x86/reconinter_avx2.c | 496
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_avx2.c | 23
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c | 24
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_sse4.c | 809
-rw-r--r--  third_party/aom/av1/common/x86/wiener_convolve_avx2.c | 3
-rw-r--r--  third_party/aom/av1/common/x86/wiener_convolve_sse2.c | 3
-rw-r--r--  third_party/aom/av1/decoder/accounting.h | 6
-rw-r--r--  third_party/aom/av1/decoder/decodeframe.c | 555
-rw-r--r--  third_party/aom/av1/decoder/decodeframe.h | 8
-rw-r--r--  third_party/aom/av1/decoder/decodemv.c | 179
-rw-r--r--  third_party/aom/av1/decoder/decodemv.h | 6
-rw-r--r--  third_party/aom/av1/decoder/decoder.c | 21
-rw-r--r--  third_party/aom/av1/decoder/decoder.h | 12
-rw-r--r--  third_party/aom/av1/decoder/decodetxb.h | 6
-rw-r--r--  third_party/aom/av1/decoder/detokenize.h | 6
-rw-r--r--  third_party/aom/av1/decoder/dthread.h | 6
-rw-r--r--  third_party/aom/av1/decoder/inspection.h | 6
-rw-r--r--  third_party/aom/av1/decoder/obu.c | 176
-rw-r--r--  third_party/aom/av1/decoder/obu.h | 29
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.c | 7
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.h | 6
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.c | 8
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.h | 6
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.c | 179
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.h | 10
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 147
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 6
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 6
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.c | 68
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.h | 6
-rw-r--r--  third_party/aom/av1/encoder/bitstream.c | 312
-rw-r--r--  third_party/aom/av1/encoder/bitstream.h | 16
-rw-r--r--  third_party/aom/av1/encoder/block.h | 53
-rw-r--r--  third_party/aom/av1/encoder/blockiness.c | 1
-rw-r--r--  third_party/aom/av1/encoder/context_tree.c | 17
-rw-r--r--  third_party/aom/av1/encoder/context_tree.h | 8
-rw-r--r--  third_party/aom/av1/encoder/corner_detect.h | 6
-rw-r--r--  third_party/aom/av1/encoder/corner_match.h | 6
-rw-r--r--  third_party/aom/av1/encoder/cost.h | 6
-rw-r--r--  third_party/aom/av1/encoder/dwt.h | 5
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.c | 1075
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.h | 11
-rw-r--r--  third_party/aom/av1/encoder/encodemb.c | 71
-rw-r--r--  third_party/aom/av1/encoder/encodemb.h | 17
-rw-r--r--  third_party/aom/av1/encoder/encodemv.c | 26
-rw-r--r--  third_party/aom/av1/encoder/encodemv.h | 14
-rw-r--r--  third_party/aom/av1/encoder/encoder.c | 490
-rw-r--r--  third_party/aom/av1/encoder/encoder.h | 105
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.c | 48
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ethread.c | 252
-rw-r--r--  third_party/aom/av1/encoder/ethread.h | 6
-rw-r--r--  third_party/aom/av1/encoder/extend.h | 6
-rw-r--r--  third_party/aom/av1/encoder/firstpass.c | 992
-rw-r--r--  third_party/aom/av1/encoder/firstpass.h | 19
-rw-r--r--  third_party/aom/av1/encoder/global_motion.c | 4
-rw-r--r--  third_party/aom/av1/encoder/global_motion.h | 6
-rw-r--r--  third_party/aom/av1/encoder/grain_test_vectors.h | 6
-rw-r--r--  third_party/aom/av1/encoder/hash.h | 8
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.c | 94
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.h | 16
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 38
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 6
-rw-r--r--  third_party/aom/av1/encoder/lookahead.h | 6
-rw-r--r--  third_party/aom/av1/encoder/mathutils.h | 7
-rw-r--r--  third_party/aom/av1/encoder/mbgraph.c | 7
-rw-r--r--  third_party/aom/av1/encoder/mbgraph.h | 6
-rw-r--r--  third_party/aom/av1/encoder/mcomp.c | 231
-rw-r--r--  third_party/aom/av1/encoder/mcomp.h | 33
-rw-r--r--  third_party/aom/av1/encoder/ml.c | 16
-rw-r--r--  third_party/aom/av1/encoder/ml.h | 11
-rw-r--r--  third_party/aom/av1/encoder/palette.h | 6
-rw-r--r--  third_party/aom/av1/encoder/partition_model_weights.h | 1457
-rw-r--r--  third_party/aom/av1/encoder/picklpf.c | 3
-rw-r--r--  third_party/aom/av1/encoder/picklpf.h | 6
-rw-r--r--  third_party/aom/av1/encoder/pickrst.c | 128
-rw-r--r--  third_party/aom/av1/encoder/pickrst.h | 23
-rw-r--r--  third_party/aom/av1/encoder/pustats.h | 183
-rw-r--r--  third_party/aom/av1/encoder/random.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ransac.h | 6
-rw-r--r--  third_party/aom/av1/encoder/rate_distortion_model_params.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.c | 96
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.h | 29
-rw-r--r--  third_party/aom/av1/encoder/ratectrl_xiph.c | 0
-rw-r--r--  third_party/aom/av1/encoder/ratectrl_xiph.h | 0
-rw-r--r--  third_party/aom/av1/encoder/rd.c | 494
-rw-r--r--  third_party/aom/av1/encoder/rd.h | 48
-rw-r--r--  third_party/aom/av1/encoder/rdopt.c | 4551
-rw-r--r--  third_party/aom/av1/encoder/rdopt.h | 18
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.c | 627
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.h | 127
-rw-r--r--  third_party/aom/av1/encoder/segmentation.h | 6
-rw-r--r--  third_party/aom/av1/encoder/speed_features.c | 145
-rw-r--r--  third_party/aom/av1/encoder/speed_features.h | 144
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.c | 81
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.h | 6
-rw-r--r--  third_party/aom/av1/encoder/tokenize.h | 6
-rw-r--r--  third_party/aom/av1/encoder/tx_prune_model_weights.h | 1482
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 259
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 75
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 6
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 6
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 9
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_avx2.c | 130
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 46
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 344
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_avx2.c | 403
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_sse4.c | 389
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 4
-rw-r--r--  third_party/aom/av1/exports_com | 2
-rw-r--r--  third_party/aom/av1/exports_dec | 1
-rw-r--r--  third_party/aom/av1/exports_test | 2
-rw-r--r--  third_party/aom/build/cmake/aom_config_defaults.cmake | 248
-rw-r--r--  third_party/aom/build/cmake/aom_configure.cmake | 55
-rw-r--r--  third_party/aom/build/cmake/aom_experiment_deps.cmake | 4
-rw-r--r--  third_party/aom/build/cmake/aom_optimization.cmake | 4
-rw-r--r--  third_party/aom/build/cmake/exports.cmake | 7
-rw-r--r--  third_party/aom/build/cmake/exports_sources.cmake | 16
-rw-r--r--  third_party/aom/build/cmake/generate_aom_config_templates.cmake | 19
-rw-r--r--  third_party/aom/build/cmake/generate_exports.cmake | 24
-rw-r--r--  third_party/aom/build/cmake/ios-Info.plist (renamed from third_party/aom/build/make/ios-Info.plist) | 0
-rwxr-xr-x  third_party/aom/build/cmake/iosbuild.sh (renamed from third_party/aom/build/make/iosbuild.sh) | 0
-rwxr-xr-x  third_party/aom/build/cmake/rtcd.pl (renamed from third_party/aom/build/make/rtcd.pl) | 6
-rw-r--r--  third_party/aom/build/cmake/util.cmake | 91
-rwxr-xr-x  third_party/aom/build/cmake/version.pl | 10
-rw-r--r--  third_party/aom/build/make/thumb.pm | 64
-rw-r--r--  third_party/aom/common/args.h | 6
-rw-r--r--  third_party/aom/common/av1_config.c | 511
-rw-r--r--  third_party/aom/common/av1_config.h | 86
-rw-r--r--  third_party/aom/common/ivfdec.h | 6
-rw-r--r--  third_party/aom/common/ivfenc.h | 6
-rw-r--r--  third_party/aom/common/md5_utils.h | 6
-rw-r--r--  third_party/aom/common/obudec.c | 2
-rw-r--r--  third_party/aom/common/obudec.h | 6
-rw-r--r--  third_party/aom/common/rawenc.c | 44
-rw-r--r--  third_party/aom/common/rawenc.h | 32
-rw-r--r--  third_party/aom/common/tools_common.h | 6
-rw-r--r--  third_party/aom/common/video_common.h | 6
-rw-r--r--  third_party/aom/common/video_reader.c | 14
-rw-r--r--  third_party/aom/common/video_reader.h | 6
-rw-r--r--  third_party/aom/common/video_writer.h | 6
-rw-r--r--  third_party/aom/common/warnings.h | 6
-rw-r--r--  third_party/aom/common/webmdec.h | 6
-rw-r--r--  third_party/aom/common/webmenc.h | 6
-rw-r--r--  third_party/aom/common/y4menc.c | 113
-rw-r--r--  third_party/aom/common/y4menc.h | 16
-rw-r--r--  third_party/aom/common/y4minput.c | 2
-rw-r--r--  third_party/aom/common/y4minput.h | 6
-rw-r--r--  third_party/aom/docs.cmake | 20
-rw-r--r--  third_party/aom/examples/aom_cx_set_ref.c | 3
-rw-r--r--  third_party/aom/examples/encoder_util.h | 6
-rw-r--r--  third_party/aom/examples/lightfield_bitstream_parsing.c | 3
-rw-r--r--  third_party/aom/examples/lightfield_decoder.c | 5
-rw-r--r--  third_party/aom/examples/lightfield_encoder.c | 5
-rw-r--r--  third_party/aom/examples/lightfield_tile_list_decoder.c | 5
-rw-r--r--  third_party/aom/examples/noise_model.c | 11
-rw-r--r--  third_party/aom/stats/aomstats.h | 6
-rw-r--r--  third_party/aom/stats/rate_hist.h | 6
-rw-r--r--  third_party/aom/test/acm_random.h | 6
-rw-r--r--  third_party/aom/test/av1_config_test.cc | 164
-rw-r--r--  third_party/aom/test/av1_convolve_2d_test_util.cc | 28
-rw-r--r--  third_party/aom/test/av1_convolve_2d_test_util.h | 6
-rw-r--r--  third_party/aom/test/av1_convolve_scale_test.cc | 2
-rw-r--r--  third_party/aom/test/av1_encoder_parms_get_to_decoder.cc | 158
-rw-r--r--  third_party/aom/test/av1_fwd_txfm2d_test.cc | 157
-rw-r--r--  third_party/aom/test/av1_highbd_iht_test.cc | 201
-rw-r--r--  third_party/aom/test/av1_round_shift_array_test.cc | 2
-rw-r--r--  third_party/aom/test/av1_txfm_test.h | 6
-rw-r--r--  third_party/aom/test/blend_a64_mask_test.cc | 260
-rw-r--r--  third_party/aom/test/blockd_test.cc | 122
-rw-r--r--  third_party/aom/test/boolcoder_test.cc | 39
-rw-r--r--  third_party/aom/test/clear_system_state.h | 6
-rw-r--r--  third_party/aom/test/codec_factory.h | 12
-rw-r--r--  third_party/aom/test/coding_path_sync.cc | 4
-rw-r--r--  third_party/aom/test/comp_avg_pred_test.cc | 6
-rw-r--r--  third_party/aom/test/comp_avg_pred_test.h | 197
-rw-r--r--  third_party/aom/test/comp_mask_variance_test.cc | 126
-rw-r--r--  third_party/aom/test/decode_api_test.cc | 2
-rw-r--r--  third_party/aom/test/decode_test_driver.h | 6
-rw-r--r--  third_party/aom/test/encode_api_test.cc | 12
-rw-r--r--  third_party/aom/test/encode_test_driver.cc | 13
-rw-r--r--  third_party/aom/test/encode_test_driver.h | 6
-rw-r--r--  third_party/aom/test/encodetxb_test.cc | 40
-rw-r--r--  third_party/aom/test/ethread_test.cc | 70
-rw-r--r--  third_party/aom/test/external_frame_buffer_test.cc | 512
-rw-r--r--  third_party/aom/test/fft_test.cc | 110
-rw-r--r--  third_party/aom/test/function_equivalence_test.h | 6
-rw-r--r--  third_party/aom/test/hiprec_convolve_test_util.h | 6
-rw-r--r--  third_party/aom/test/i420_video_source.h | 6
-rw-r--r--  third_party/aom/test/invalid_file_test.cc | 44
-rw-r--r--  third_party/aom/test/ivf_video_source.h | 6
-rw-r--r--  third_party/aom/test/log2_test.cc | 50
-rw-r--r--  third_party/aom/test/md5_helper.h | 6
-rw-r--r--  third_party/aom/test/obmc_variance_test.cc | 61
-rw-r--r--  third_party/aom/test/pickrst_test.cc | 187
-rw-r--r--  third_party/aom/test/quantize_func_test.cc | 122
-rw-r--r--  third_party/aom/test/reconinter_test.cc | 34
-rw-r--r--  third_party/aom/test/register_state_check.h | 6
-rw-r--r--  third_party/aom/test/sum_squares_test.cc | 42
-rw-r--r--  third_party/aom/test/test-data.sha1 | 20
-rw-r--r--  third_party/aom/test/test.cmake | 29
-rw-r--r--  third_party/aom/test/test_data_util.cmake | 20
-rw-r--r--  third_party/aom/test/test_vector_test.cc | 3
-rw-r--r--  third_party/aom/test/test_vectors.cc | 3
-rw-r--r--  third_party/aom/test/test_vectors.h | 6
-rw-r--r--  third_party/aom/test/transform_test_base.h | 6
-rw-r--r--  third_party/aom/test/util.h | 6
-rw-r--r--  third_party/aom/test/variance_test.cc | 117
-rw-r--r--  third_party/aom/test/video_source.h | 6
-rw-r--r--  third_party/aom/test/warp_filter_test.cc | 30
-rw-r--r--  third_party/aom/test/warp_filter_test_util.cc | 123
-rw-r--r--  third_party/aom/test/warp_filter_test_util.h | 20
-rw-r--r--  third_party/aom/test/webm_video_source.h | 6
-rw-r--r--  third_party/aom/test/wiener_test.cc | 280
-rw-r--r--  third_party/aom/test/y4m_test.cc | 28
-rw-r--r--  third_party/aom/test/y4m_video_source.h | 6
-rw-r--r--  third_party/aom/test/yuv_video_source.h | 6
-rw-r--r--  third_party/aom/third_party/vector/LICENSE | 19
-rw-r--r--  third_party/aom/third_party/vector/README.libaom | 14
-rw-r--r--  third_party/aom/tools/obu_parser.cc | 3
-rw-r--r--  third_party/aom/tools/obu_parser.h | 6
-rw-r--r--  third_party/aom/tools/txfm_analyzer/txfm_graph.h | 6
488 files changed, 32952 insertions, 14796 deletions
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
index a1e810d49..a58e54f40 100644
--- a/third_party/aom/CMakeLists.txt
+++ b/third_party/aom/CMakeLists.txt
@@ -18,45 +18,6 @@ if(NOT EMSCRIPTEN)
endif()
endif()
-option(ENABLE_CCACHE "Enable ccache support." OFF)
-option(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" OFF)
-option(ENABLE_DISTCC "Enable distcc support." OFF)
-option(ENABLE_DOCS "Enable documentation generation (doxygen required)." ON)
-option(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" OFF)
-option(ENABLE_EXAMPLES "Enables build of example code." ON)
-option(ENABLE_GOMA "Enable goma support." OFF)
-option(ENABLE_IDE_TEST_HOSTING
- "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
-option(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
-option(ENABLE_TESTDATA "Enables unit test data download targets." ON)
-option(ENABLE_TESTS "Enables unit tests." ON)
-option(ENABLE_TOOLS "Enable applications in tools sub directory." ON)
-option(ENABLE_WERROR "Converts warnings to errors at compile time." OFF)
-
-# ARM assembly/intrinsics flags.
-option(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
-
-# MIPS assembly/intrinsics flags.
-option(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets." OFF)
-option(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF)
-
-# VSX intrinsics flags.
-option(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." ON)
-
-# x86/x86_64 assembly/intrinsics flags.
-option(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSE2 "Enables SSE2 optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
-option(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
-option(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON)
-option(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON)
-
-# $BUILD_SHARED_LIBS is a CMake built-in-- it's listed here for visibility.
-option(BUILD_SHARED_LIBS "CMake should generate a shared library build." OFF)
-
project(AOM C CXX)
set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
@@ -95,7 +56,7 @@ list(APPEND AOM_RTCD_SOURCES
"${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
"${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
"${AOM_ROOT}/av1/common/av1_rtcd.c"
- "${AOM_ROOT}/build/make/rtcd.pl")
+ "${AOM_ROOT}/build/cmake/rtcd.pl")
list(APPEND AOM_LIBWEBM_SOURCES
"${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc"
@@ -161,12 +122,19 @@ list(APPEND AOM_SOURCES
"${AOM_ROOT}/aom/src/aom_image.c"
"${AOM_ROOT}/aom/src/aom_integer.c")
-list(APPEND AOM_COMMON_APP_UTIL_SOURCES "${AOM_ROOT}/common/args.c"
- "${AOM_ROOT}/common/args.h" "${AOM_ROOT}/common/md5_utils.c"
+list(APPEND AOM_COMMON_APP_UTIL_SOURCES
+ "${AOM_ROOT}/common/args.c"
+ "${AOM_ROOT}/common/args.h"
+ "${AOM_ROOT}/common/av1_config.c"
+ "${AOM_ROOT}/common/av1_config.h"
+ "${AOM_ROOT}/common/md5_utils.c"
"${AOM_ROOT}/common/md5_utils.h"
"${AOM_ROOT}/common/tools_common.c"
"${AOM_ROOT}/common/tools_common.h"
- "${AOM_ROOT}/common/video_common.h" "${AOM_ROOT}/common/y4menc.c"
+ "${AOM_ROOT}/common/video_common.h"
+ "${AOM_ROOT}/common/rawenc.c"
+ "${AOM_ROOT}/common/rawenc.h"
+ "${AOM_ROOT}/common/y4menc.c"
"${AOM_ROOT}/common/y4menc.h")
list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c"
@@ -255,10 +223,10 @@ add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES})
add_dependencies(aom_rtcd aom_version)
-if (ENABLE_EXAMPLES)
-add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
-endif ()
-
+if(ENABLE_EXAMPLES)
+ add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats)
+endif()
add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)
if(NOT MSVC AND NOT APPLE)
@@ -288,8 +256,8 @@ endforeach()
# aom_common_app_util library must define this function. This is a convenience
# to allow omission of the function from applications that might want to use
# other pieces of the util support without defining usage_exit().
-file(WRITE "${AOM_CONFIG_DIR}/usage_exit.c" "void usage_exit(void) {}")
-file(WRITE "${AOM_CONFIG_DIR}/usage_exit.cc"
+file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "void usage_exit(void) {}")
+file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc"
"extern \"C\" void usage_exit(void) {}")
#
@@ -349,6 +317,8 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
add_preproc_definition(_POSIX_SOURCE)
append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184")
append_link_flag_to_target("inspect" "-s MODULARIZE=1")
+ append_link_flag_to_target(
+ "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'")
append_link_flag_to_target("inspect"
"-s EXPORT_NAME=\"\'DecoderModule\'\"")
append_link_flag_to_target("inspect" "--memory-init-file 0")
@@ -407,7 +377,7 @@ if(CONFIG_AV1_ENCODER)
# TODO(tomfinegan): Sort out why a simple link command with
# aom_entropy_optimizer.c won't work on macos, but dragging in all the
# helper machinery allows the link to succeed.
- add_executable(aom_entropy_optimizer "${AOM_CONFIG_DIR}/usage_exit.c"
+ add_executable(aom_entropy_optimizer "${AOM_GEN_SRC_DIR}/usage_exit.c"
"${AOM_ROOT}/tools/aom_entropy_optimizer.c"
$<TARGET_OBJECTS:aom_common_app_util>
$<TARGET_OBJECTS:aom_encoder_app_util>)
@@ -434,7 +404,7 @@ endif()
if(ENABLE_TOOLS)
if(CONFIG_AV1_DECODER)
require_cxx_flag_nomsvc("-std=c++11" NO)
- add_executable(dump_obu "${AOM_CONFIG_DIR}/usage_exit.cc"
+ add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.cc"
"${AOM_ROOT}/tools/dump_obu.cc"
"${AOM_ROOT}/tools/obu_parser.cc"
"${AOM_ROOT}/tools/obu_parser.h"
@@ -555,9 +525,7 @@ endif()
if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
find_package(Threads)
- foreach(app_target ${AOM_APP_TARGETS})
- target_link_libraries(${app_target} ${AOM_LIB_LINK_TYPE} Threads::Threads)
- endforeach()
+ target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads)
endif()
if(XCODE)
diff --git a/third_party/aom/README.md b/third_party/aom/README.md
index f5446f9d9..cab3f9993 100644
--- a/third_party/aom/README.md
+++ b/third_party/aom/README.md
@@ -414,6 +414,16 @@ First, add the new test data file to the `aom-test-data` bucket of the
`aomedia-testing` project on Google Cloud Platform. You may need to ask someone
with the necessary access permissions to do this for you.
+NOTE: When a new test data file is added to the `aom-test-data` bucket, its
+"Public access" is initially "Not public". We need to change its
+"Public access" to "Public" by using the following
+[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command:
+~~~
+ $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name
+~~~
+This command grants the `AllUsers` group READ access to the file named
+"test-data-file-name" in the `aom-test-data` bucket.
+
Once the new test data file has been added to `aom-test-data`, create a CL to
add the name of the new test data file to `test/test_data_util.cmake` and add
the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1
diff --git a/third_party/aom/aom/aom.h b/third_party/aom/aom/aom.h
index c5ef2517d..b1cc1ecce 100644
--- a/third_party/aom/aom/aom.h
+++ b/third_party/aom/aom/aom.h
@@ -28,8 +28,8 @@
/*!\file
* \brief Provides controls common to both the AOM encoder and decoder.
*/
-#ifndef AOM_AOM_H_
-#define AOM_AOM_H_
+#ifndef AOM_AOM_AOM_H_
+#define AOM_AOM_AOM_H_
#include "aom/aom_codec.h"
#include "aom/aom_image.h"
@@ -144,4 +144,4 @@ AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *)
} // extern "C"
#endif
-#endif // AOM_AOM_H_
+#endif // AOM_AOM_AOM_H_
diff --git a/third_party/aom/aom/aom_codec.h b/third_party/aom/aom/aom_codec.h
index 63e358624..fc0df5b9e 100644
--- a/third_party/aom/aom/aom_codec.h
+++ b/third_party/aom/aom/aom_codec.h
@@ -33,11 +33,11 @@
* }
* </pre>
*
- * Once initialized, the instance is manged using other functions from
+ * Once initialized, the instance is managed using other functions from
* the aom_codec_* family.
*/
-#ifndef AOM_AOM_CODEC_H_
-#define AOM_AOM_CODEC_H_
+#ifndef AOM_AOM_AOM_CODEC_H_
+#define AOM_AOM_AOM_CODEC_H_
#ifdef __cplusplus
extern "C" {
@@ -259,12 +259,6 @@ typedef enum aom_superblock_size {
*
*/
int aom_codec_version(void);
-#define AOM_VERSION_MAJOR(v) \
- ((v >> 16) & 0xff) /**< extract major from packed version */
-#define AOM_VERSION_MINOR(v) \
- ((v >> 8) & 0xff) /**< extract minor from packed version */
-#define AOM_VERSION_PATCH(v) \
- ((v >> 0) & 0xff) /**< extract patch from packed version */
/*!\brief Return the version major number */
#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)
@@ -526,4 +520,4 @@ typedef struct cfg_options {
#ifdef __cplusplus
}
#endif
-#endif // AOM_AOM_CODEC_H_
+#endif // AOM_AOM_AOM_CODEC_H_
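The AOM_VERSION_* macros deleted above duplicated the aom_codec_version_major/minor/patch() helpers that remain in the header. A minimal C sketch of the packed layout those helpers decode (illustration only, not part of the diff):

~~~
#include <stdio.h>
#include "aom/aom_codec.h"

int main(void) {
  /* aom_codec_version() packs the version as (major << 16) | (minor << 8) | patch. */
  const int v = aom_codec_version();
  printf("libaom %d.%d.%d\n", (v >> 16) & 0xff, (v >> 8) & 0xff, (v >> 0) & 0xff);
  return 0;
}
~~~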
diff --git a/third_party/aom/aom/aom_decoder.h b/third_party/aom/aom/aom_decoder.h
index 3bbdcd7e2..06c2dc5f7 100644
--- a/third_party/aom/aom/aom_decoder.h
+++ b/third_party/aom/aom/aom_decoder.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AOM_DECODER_H_
-#define AOM_AOM_DECODER_H_
+#ifndef AOM_AOM_AOM_DECODER_H_
+#define AOM_AOM_AOM_DECODER_H_
/*!\defgroup decoder Decoder Algorithm Interface
* \ingroup codec
@@ -361,4 +361,4 @@ aom_codec_err_t aom_codec_set_frame_buffer_functions(
#ifdef __cplusplus
}
#endif
-#endif // AOM_AOM_DECODER_H_
+#endif // AOM_AOM_AOM_DECODER_H_
diff --git a/third_party/aom/aom/aom_encoder.h b/third_party/aom/aom/aom_encoder.h
index 6003088ed..0894ca9e3 100644
--- a/third_party/aom/aom/aom_encoder.h
+++ b/third_party/aom/aom/aom_encoder.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AOM_ENCODER_H_
-#define AOM_AOM_ENCODER_H_
+#ifndef AOM_AOM_AOM_ENCODER_H_
+#define AOM_AOM_AOM_ENCODER_H_
/*!\defgroup encoder Encoder Algorithm Interface
* \ingroup codec
@@ -54,13 +54,6 @@ extern "C" {
*/
#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
-/*! Can output one partition at a time. Each partition is returned in its
- * own AOM_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for
- * every partition but the last. In this mode all frames are always
- * returned partition by partition.
- */
-#define AOM_CODEC_CAP_OUTPUT_PARTITION 0x20000
-
/*! Can support input images at greater than 8 bitdepth.
*/
#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
@@ -74,7 +67,6 @@ extern "C" {
*/
#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
/*!\brief Make the encoder output one partition at a time. */
-#define AOM_CODEC_USE_OUTPUT_PARTITION 0x20000
#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
@@ -119,11 +111,6 @@ typedef uint32_t aom_codec_frame_flags_t;
typedef uint32_t aom_codec_er_flags_t;
/*!\brief Improve resiliency against losses of whole frames */
#define AOM_ERROR_RESILIENT_DEFAULT 0x1
-/*!\brief The frame partitions are independently decodable by the bool decoder,
- * meaning that partitions can be decoded even though earlier partitions have
- * been lost. Note that intra prediction is still done over the partition
- * boundary. */
-#define AOM_ERROR_RESILIENT_PARTITIONS 0x2
/*!\brief Encoder output packet variants
*
@@ -849,13 +836,25 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
/*!\brief Get global stream headers
*
* Retrieves a stream level global header packet, if supported by the codec.
+ * Calls to this function should be deferred until all configuration information
+ * has been passed to libaom. Otherwise the global header data may be
+ * invalidated by additional configuration changes.
+ *
+ * The AV1 implementation of this function returns an OBU. The OBU returned is
+ * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is
+ * set, and the buffer contains the obu_size field for the returned OBU.
*
* \param[in] ctx Pointer to this instance's context
*
* \retval NULL
- * Encoder does not support global header
+ * Encoder does not support global header, or an error occurred while
+ * generating the global header.
+ *
* \retval Non-NULL
- * Pointer to buffer containing global header packet
+ * Pointer to buffer containing global header packet. The caller owns the
+ * memory associated with this buffer, and must free the 'buf' member of the
+ * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned
+ * must be freed via call to free().
*/
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
@@ -979,4 +978,4 @@ const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
#ifdef __cplusplus
}
#endif
-#endif // AOM_AOM_ENCODER_H_
+#endif // AOM_AOM_AOM_ENCODER_H_
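The reworded aom_codec_get_global_headers() documentation above introduces a caller-owns contract that is easy to misread. A hedged C sketch of that contract, assuming `ctx` is an already-configured encoder context (illustration only, not part of the diff):

~~~
aom_fixed_buf_t *hdr = aom_codec_get_global_headers(&ctx);
if (hdr) {
  /* hdr->buf holds a Low Overhead Bitstream Format OBU of hdr->sz bytes;
   * copy it into the container's codec configuration record here. */
  free(hdr->buf); /* the caller owns the buffer ... */
  free(hdr);      /* ... and the aom_fixed_buf_t itself */
}
~~~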
diff --git a/third_party/aom/aom/aom_frame_buffer.h b/third_party/aom/aom/aom_frame_buffer.h
index b979fcf2b..fba4322f8 100644
--- a/third_party/aom/aom/aom_frame_buffer.h
+++ b/third_party/aom/aom/aom_frame_buffer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AOM_FRAME_BUFFER_H_
-#define AOM_AOM_FRAME_BUFFER_H_
+#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_AOM_FRAME_BUFFER_H_
/*!\file
* \brief Describes the decoder external frame buffer interface.
@@ -81,4 +81,4 @@ typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
} // extern "C"
#endif
-#endif // AOM_AOM_FRAME_BUFFER_H_
+#endif // AOM_AOM_AOM_FRAME_BUFFER_H_
diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h
index b01317b3f..a960127f1 100644
--- a/third_party/aom/aom/aom_image.h
+++ b/third_party/aom/aom/aom_image.h
@@ -13,8 +13,8 @@
* \brief Describes the aom image descriptor and associated operations
*
*/
-#ifndef AOM_AOM_IMAGE_H_
-#define AOM_AOM_IMAGE_H_
+#ifndef AOM_AOM_AOM_IMAGE_H_
+#define AOM_AOM_AOM_IMAGE_H_
#ifdef __cplusplus
extern "C" {
@@ -328,4 +328,4 @@ int aom_img_plane_height(const aom_image_t *img, int plane);
} // extern "C"
#endif
-#endif // AOM_AOM_IMAGE_H_
+#endif // AOM_AOM_AOM_IMAGE_H_
diff --git a/third_party/aom/aom/aom_integer.h b/third_party/aom/aom/aom_integer.h
index 907d4cbec..90263bd4f 100644
--- a/third_party/aom/aom/aom_integer.h
+++ b/third_party/aom/aom/aom_integer.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AOM_INTEGER_H_
-#define AOM_AOM_INTEGER_H_
+#ifndef AOM_AOM_AOM_INTEGER_H_
+#define AOM_AOM_AOM_INTEGER_H_
/* get ptrdiff_t, size_t, wchar_t, NULL */
#include <stddef.h>
@@ -103,4 +103,4 @@ int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
} // extern "C"
#endif // __cplusplus
-#endif // AOM_AOM_INTEGER_H_
+#endif // AOM_AOM_AOM_INTEGER_H_
diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h
index e77e5f693..013ddf57e 100644
--- a/third_party/aom/aom/aomcx.h
+++ b/third_party/aom/aom/aomcx.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_AOMCX_H_
-#define AOM_AOMCX_H_
+#ifndef AOM_AOM_AOMCX_H_
+#define AOM_AOM_AOMCX_H_
/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
* \ingroup aom
@@ -181,10 +181,6 @@ enum aome_enc_control_id {
*/
AOME_SET_CPUUSED = 13,
- /*!\brief Speed features for codec development
- */
- AOME_SET_DEVSF,
-
/*!\brief Codec control function to enable automatic set and use alf frames.
*/
AOME_SET_ENABLEAUTOALTREF,
@@ -286,14 +282,19 @@ enum aome_enc_control_id {
*/
AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2,
+ /** control function to enable the row based multi-threading of encoder. A
+ * value that is equal to 1 indicates that row based multi-threading is
+ * enabled.
+ */
+ AV1E_SET_ROW_MT,
+
/*!\brief Codec control function to set number of tile columns.
*
* In encoding and decoding, AV1 allows an input image frame be partitioned
- * into separated vertical tile columns, which can be encoded or decoded
+ * into separate vertical tile columns, which can be encoded or decoded
* independently. This enables easy implementation of parallel encoding and
- * decoding. This control requests the encoder to use column tiles in
- * encoding an input frame, with number of tile columns (in Log2 unit) as
- * the parameter:
+ * decoding. The parameter for this control describes the number of tile
+ * columns (in log2 units), which has a valid range of [0, 6]:
* 0 = 1 tile column
* 1 = 2 tile columns
* 2 = 4 tile columns
@@ -310,16 +311,14 @@ enum aome_enc_control_id {
/*!\brief Codec control function to set number of tile rows.
*
* In encoding and decoding, AV1 allows an input image frame be partitioned
- * into separated horizontal tile rows. Tile rows are encoded or decoded
- * sequentially. Even though encoding/decoding of later tile rows depends on
- * earlier ones, this allows the encoder to output data packets for tile rows
- * prior to completely processing all tile rows in a frame, thereby reducing
- * the latency in processing between input and output. The parameter
- * for this control describes the number of tile rows, which has a valid
- * range [0, 2]:
+ * into separate horizontal tile rows, which can be encoded or decoded
+ * independently. The parameter for this control describes the number of tile
+ * rows (in log2 units), which has a valid range of [0, 6]:
* 0 = 1 tile row
* 1 = 2 tile rows
* 2 = 4 tile rows
+ * .....
+ * n = 2**n tile rows
*
* By default, the value is 0, i.e. one single row tile for entire image.
*/
@@ -860,6 +859,12 @@ enum aome_enc_control_id {
/*!\brief Sets the denoisers block size */
AV1E_SET_DENOISE_BLOCK_SIZE,
+
+ /*!\brief Sets the chroma subsampling x value */
+ AV1E_SET_CHROMA_SUBSAMPLING_X,
+
+ /*!\brief Sets the chroma subsampling y value */
+ AV1E_SET_CHROMA_SUBSAMPLING_Y,
};
/*!\brief aom 1-D scaling mode
@@ -993,6 +998,9 @@ AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
#define AOM_CTRL_AOME_SET_CQ_LEVEL
+AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, int)
+#define AOM_CTRL_AV1E_SET_ROW_MT
+
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
@@ -1137,10 +1145,6 @@ AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
#define AOM_CTRL_AV1E_SET_COLOR_RANGE
-/*!\brief
- *
- * TODO(rbultje) : add support of the control in ffmpeg
- */
#define AOM_CTRL_AV1E_SET_RENDER_SIZE
AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
@@ -1179,10 +1183,16 @@ AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int);
#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
#endif
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y
+
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_AOMCX_H_
+#endif // AOM_AOM_AOMCX_H_
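The aomcx.h changes above add AV1E_SET_ROW_MT and clarify that the tile controls take log2 counts. A short sketch of their use, assuming an initialized encoder context `ctx` (illustration only, not part of the diff):

~~~
/* Tile control values are log2 counts: 2 means 2^2 = 4 tiles. */
aom_codec_control(&ctx, AV1E_SET_TILE_COLUMNS, 2); /* 4 tile columns */
aom_codec_control(&ctx, AV1E_SET_TILE_ROWS, 1);    /* 2 tile rows */
aom_codec_control(&ctx, AV1E_SET_ROW_MT, 1);       /* enable row-based multi-threading */
~~~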
diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h
index 50ff22410..765856a1b 100644
--- a/third_party/aom/aom/aomdx.h
+++ b/third_party/aom/aom/aomdx.h
@@ -18,8 +18,8 @@
* \brief Provides definitions for using AOM or AV1 within the aom Decoder
* interface.
*/
-#ifndef AOM_AOMDX_H_
-#define AOM_AOMDX_H_
+#ifndef AOM_AOM_AOMDX_H_
+#define AOM_AOM_AOMDX_H_
#ifdef __cplusplus
extern "C" {
@@ -227,6 +227,12 @@ enum aom_dec_control_id {
*/
AV1_SET_INSPECTION_CALLBACK,
+ /** control function to set the skip film grain flag. Valid values are
+ * integers. The decoder will skip the film grain when its value is set to
+ * nonzero. The default value is 0.
+ */
+ AV1D_SET_SKIP_FILM_GRAIN,
+
AOM_DECODER_CTRL_ID_MAX,
};
@@ -276,6 +282,8 @@ AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
#define AOM_CTRL_AV1D_EXT_TILE_DEBUG
AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
#define AOM_CTRL_AV1D_SET_ROW_MT
+AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
+#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
#define AOM_CTRL_AV1D_SET_IS_ANNEXB
AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
@@ -291,4 +299,4 @@ AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
} // extern "C"
#endif
-#endif // AOM_AOMDX_H_
+#endif // AOM_AOM_AOMDX_H_
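The matching decoder-side addition, AV1D_SET_SKIP_FILM_GRAIN, can be exercised the same way; a sketch assuming an initialized decoder context `dec_ctx` (illustration only, not part of the diff):

~~~
/* Nonzero skips film grain synthesis during decode; the default of 0 applies it. */
aom_codec_control(&dec_ctx, AV1D_SET_SKIP_FILM_GRAIN, 1);
~~~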
diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com
index a87a4536c..2798bd51a 100644
--- a/third_party/aom/aom/exports_com
+++ b/third_party/aom/aom/exports_com
@@ -10,14 +10,23 @@ text aom_codec_version
text aom_codec_version_extra_str
text aom_codec_version_str
text aom_img_alloc
+text aom_img_alloc_with_border
text aom_img_flip
text aom_img_free
-text aom_img_plane_width
text aom_img_plane_height
+text aom_img_plane_width
text aom_img_set_rect
text aom_img_wrap
-text aom_img_alloc_with_border
+text aom_malloc
+text aom_rb_bytes_read
+text aom_rb_read_bit
+text aom_rb_read_literal
+text aom_rb_read_uvlc
text aom_uleb_decode
text aom_uleb_encode
text aom_uleb_encode_fixed_size
text aom_uleb_size_in_bytes
+text aom_wb_bytes_written
+text aom_wb_write_bit
+text aom_wb_write_literal
+text aom_wb_write_unsigned_literal
diff --git a/third_party/aom/aom/exports_dec b/third_party/aom/aom/exports_dec
index de8fe449a..d7d1c4f7d 100644
--- a/third_party/aom/aom/exports_dec
+++ b/third_party/aom/aom/exports_dec
@@ -6,3 +6,5 @@ text aom_codec_peek_stream_info
text aom_codec_register_put_frame_cb
text aom_codec_register_put_slice_cb
text aom_codec_set_frame_buffer_functions
+text aom_obu_type_to_string
+text aom_read_obu_header
diff --git a/third_party/aom/aom/exports_enc b/third_party/aom/aom/exports_enc
index 0dcca7da3..918d742f0 100644
--- a/third_party/aom/aom/exports_enc
+++ b/third_party/aom/aom/exports_enc
@@ -7,3 +7,12 @@ text aom_codec_get_cx_data
text aom_codec_get_global_headers
text aom_codec_get_preview_frame
text aom_codec_set_cx_data_buf
+text aom_film_grain_table_append
+text aom_film_grain_table_free
+text aom_film_grain_table_write
+text aom_flat_block_finder_init
+text aom_flat_block_finder_run
+text aom_noise_model_init
+text aom_noise_model_get_grain_parameters
+text aom_noise_model_save_latest
+text aom_noise_model_update
diff --git a/third_party/aom/aom/exports_test b/third_party/aom/aom/exports_test
new file mode 100644
index 000000000..01b864bae
--- /dev/null
+++ b/third_party/aom/aom/exports_test
@@ -0,0 +1,2 @@
+text aom_dsp_rtcd
+text aom_scale_rtcd
diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h
index 88bf78ef2..21c0dc69c 100644
--- a/third_party/aom/aom/internal/aom_codec_internal.h
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@@ -38,11 +38,11 @@
* }
* </pre>
*
- * Once initialized, the instance is manged using other functions from
+ * Once initialized, the instance is managed using other functions from
* the aom_codec_* family.
*/
-#ifndef AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
-#define AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
#include "../aom_decoder.h"
#include "../aom_encoder.h"
#include <stdarg.h>
@@ -438,4 +438,4 @@ void aom_merge_corrupted_flag(int *corrupted, int value);
} // extern "C"
#endif
-#endif // AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
diff --git a/third_party/aom/aom/src/aom_decoder.c b/third_party/aom/aom/src/aom_decoder.c
index e0cec10b6..8c9111faf 100644
--- a/third_party/aom/aom/src/aom_decoder.c
+++ b/third_party/aom/aom/src/aom_decoder.c
@@ -101,9 +101,7 @@ aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
size_t data_sz, void *user_priv) {
aom_codec_err_t res;
- /* Sanity checks */
- /* NULL data ptr allowed if data_sz is 0 too */
- if (!ctx || (!data && data_sz) || (data && !data_sz))
+ if (!ctx)
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv)
res = AOM_CODEC_ERROR;
diff --git a/third_party/aom/aom/src/aom_encoder.c b/third_party/aom/aom/src/aom_encoder.c
index 22765d6a6..523f40bbe 100644
--- a/third_party/aom/aom/src/aom_encoder.c
+++ b/third_party/aom/aom/src/aom_encoder.c
@@ -16,7 +16,9 @@
#include "config/aom_config.h"
#if HAVE_FEXCEPT
+#ifndef _GNU_SOURCE
#define _GNU_SOURCE
+#endif
#include <fenv.h>
#endif
@@ -46,9 +48,6 @@ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
res = AOM_CODEC_INCAPABLE;
else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
res = AOM_CODEC_INCAPABLE;
- else if ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
- !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))
- res = AOM_CODEC_INCAPABLE;
else {
ctx->iface = iface;
ctx->name = iface->name;
@@ -81,9 +80,6 @@ aom_codec_err_t aom_codec_enc_init_multi_ver(
res = AOM_CODEC_INCAPABLE;
else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
res = AOM_CODEC_INCAPABLE;
- else if ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
- !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))
- res = AOM_CODEC_INCAPABLE;
else {
int i;
void *mem_loc = NULL;
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
index bba37e227..4791826da 100644
--- a/third_party/aom/aom_dsp/aom_convolve.c
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -16,7 +16,6 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h
deleted file mode 100644
index 6f5b888e4..000000000
--- a/third_party/aom/aom_dsp/aom_convolve.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_DSP_AOM_CONVOLVE_H_
-#define AOM_DSP_AOM_CONVOLVE_H_
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Note: Fixed size intermediate buffers, place limits on parameters
-// of some functions. 2d filtering proceeds in 2 steps:
-// (1) Interpolate horizontally into an intermediate buffer, temp.
-// (2) Interpolate temp vertically to derive the sub-pixel result.
-// Deriving the maximum number of rows in the temp buffer (135):
-// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-// --Largest block size is 64x64 pixels.
-// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-// original frame (in 1/16th pixel units).
-// --Must round-up because block may be located at sub-pixel position.
-// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-// TODO(wtc): Update the above comment to explain the value 263 used in aom.
-#define MAX_EXT_SIZE 263
-
-#define EXTRAPREC_BITS 2
-#define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS))
-
-typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
-typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_DSP_AOM_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
index 7c0111a69..11ff73756 100644
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -15,11 +15,14 @@ set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
list(APPEND AOM_DSP_COMMON_SOURCES
"${AOM_ROOT}/aom_dsp/aom_convolve.c"
- "${AOM_ROOT}/aom_dsp/aom_convolve.h"
"${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
"${AOM_ROOT}/aom_dsp/aom_filter.h"
"${AOM_ROOT}/aom_dsp/aom_simd.h"
"${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
"${AOM_ROOT}/aom_dsp/blend.h"
"${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
"${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
@@ -64,7 +67,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
"${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h")
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h")
list(APPEND AOM_DSP_COMMON_ASM_SSSE3
"${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
@@ -76,6 +80,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
"${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
@@ -88,7 +93,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c")
list(APPEND AOM_DSP_COMMON_INTRIN_NEON
"${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
@@ -125,12 +131,9 @@ if(CONFIG_AV1_DECODER)
"${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
"${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
"${AOM_ROOT}/aom_dsp/bitreader.h"
- "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
- "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
"${AOM_ROOT}/aom_dsp/daalaboolreader.c"
"${AOM_ROOT}/aom_dsp/daalaboolreader.h"
- "${AOM_ROOT}/aom_dsp/entdec.c"
- "${AOM_ROOT}/aom_dsp/entdec.h"
+ "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h"
"${AOM_ROOT}/aom_dsp/grain_synthesis.c"
"${AOM_ROOT}/aom_dsp/grain_synthesis.h")
endif()
@@ -140,8 +143,6 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
"${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
"${AOM_ROOT}/aom_dsp/bitwriter.h"
- "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
- "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
"${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
"${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
"${AOM_ROOT}/aom_dsp/entenc.c"
@@ -158,13 +159,13 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/quantize.c"
"${AOM_ROOT}/aom_dsp/quantize.h"
"${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/sse.c"
"${AOM_ROOT}/aom_dsp/sad_av1.c"
"${AOM_ROOT}/aom_dsp/sum_squares.c"
"${AOM_ROOT}/aom_dsp/variance.c"
"${AOM_ROOT}/aom_dsp/variance.h")
list(APPEND AOM_DSP_ENCODER_ASM_SSE2
- "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
"${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
"${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
"${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
@@ -178,11 +179,11 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
"${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
"${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
@@ -199,8 +200,12 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c")
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
"${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
@@ -219,6 +224,7 @@ if(CONFIG_AV1_ENCODER)
list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
"${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
index c5dc9a834..a185b23c8 100644
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_AOM_DSP_COMMON_H_
-#define AOM_DSP_AOM_DSP_COMMON_H_
+#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
#include "config/aom_config.h"
@@ -95,4 +95,4 @@ static INLINE unsigned int negative_to_zero(int value) {
} // extern "C"
#endif
-#endif // AOM_DSP_AOM_DSP_COMMON_H_
+#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
index 1a9ac3184..8e8a480fe 100755
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -519,23 +519,23 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
# Quantization
#
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
- add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
- add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
} # CONFIG_AV1_ENCODER
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_highbd_quantize_b sse2 avx2/;
- add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_highbd_quantize_b_32x32 sse2/;
- add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
} # CONFIG_AV1_ENCODER
@@ -543,12 +543,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# Alpha blending with mask
#
add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
-specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/;
+specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1/;
+specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
@@ -569,15 +569,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/aom_highbd_subtract_block sse2/;
+ add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
+ specialize qw/aom_sse sse4_1 avx2/;
+
+ add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+ specialize qw/aom_highbd_sse sse4_1 avx2/;
+
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
# Sum of Squares
#
add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
- specialize qw/aom_sum_squares_2d_i16 sse2/;
+ specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
specialize qw/aom_sum_squares_i16 sse2/;
+
}
@@ -830,7 +837,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_sad16x64x4d sse2/;
specialize qw/aom_highbd_sad64x16x4d sse2/;
-
#
# Structured Similarity (SSIM)
#
@@ -888,36 +894,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref, int ref_stride";
+ int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
specialize qw/aom_upsampled_pred sse2/;
add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride";
+ int ref_stride, int subpel_search";
specialize qw/aom_comp_avg_upsampled_pred sse2/;
add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
+ add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search";
+ specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+ const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
specialize qw/aom_highbd_upsampled_pred sse2/;
add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param";
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
@@ -1101,7 +1114,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+ specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
}
@@ -1154,17 +1167,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
- #
- # Specialty Subpixel
- #
- add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_variance_halfpixvar16x16_h sse2/;
-
- add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_variance_halfpixvar16x16_v sse2/;
-
- add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_variance_halfpixvar16x16_hv sse2/;
#
# Comp Avg
@@ -1174,6 +1176,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
specialize qw/aom_jnt_comp_avg_pred ssse3/;
+ add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance128x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance128x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x128 sse2/;
add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/aom_highbd_12_variance64x64 sse2/;
@@ -1209,40 +1219,58 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
+
add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x64 sse2/;
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x32 sse2/;
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x64 sse2/;
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x32 sse2/;
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x16 sse2/;
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x32 sse2/;
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x16 sse2/;
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x8 sse2/;
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x16 sse2/;
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x8 sse2/;
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2/;
add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance128x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance128x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x128 sse2/;
+
add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/aom_highbd_8_variance64x64 sse2/;
@@ -1310,9 +1338,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/aom_highbd_12_mse8x8 sse2/;
- add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
- add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
#
@@ -1539,8 +1567,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
specialize qw/aom_comp_mask_pred ssse3 avx2/;
- add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
- specialize qw/aom_highbd_comp_mask_pred avx2/;
+ add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
} # CONFIG_AV1_ENCODER
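For readers unfamiliar with the .pl file edited above: add_proto declares one function signature and specialize lists the SIMD flavors implementing it; the build generates a runtime-dispatch header from these. A hedged C sketch of roughly what that machinery amounts to (names and the trimmed parameter list are illustrative, not the literal generated code; the real coefficient type is tran_low_t):

#include <stdint.h>

typedef void (*quantize_fn)(const int16_t *coeff, intptr_t n_coeffs);

static void aom_quantize_b_c(const int16_t *coeff, intptr_t n_coeffs) {
  (void)coeff; (void)n_coeffs;  // portable reference implementation
}
static void aom_quantize_b_sse2(const int16_t *coeff, intptr_t n_coeffs) {
  (void)coeff; (void)n_coeffs;  // SSE2 implementation
}

// The generated header exposes a pointer under the plain name...
static quantize_fn aom_quantize_b = aom_quantize_b_c;

// ...and a setup step retargets it once CPU features are known.
static void setup_rtcd(int have_sse2) {
  if (have_sse2) aom_quantize_b = aom_quantize_b_sse2;
}

So a line like "specialize qw/aom_highbd_10_variance16x16 sse2 avx2/" simply adds avx2 to the set of candidates the setup step may select.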
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
index fd4f51b29..00686ac38 100644
--- a/third_party/aom/aom_dsp/aom_filter.h
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_AOM_FILTER_H_
-#define AOM_DSP_AOM_FILTER_H_
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
#include "aom/aom_integer.h"
@@ -53,4 +53,4 @@ static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
} // extern "C"
#endif
-#endif // AOM_DSP_AOM_FILTER_H_
+#endif // AOM_AOM_DSP_AOM_FILTER_H_
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
index 392b36627..ab950ca55 100644
--- a/third_party/aom/aom_dsp/aom_simd.h
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_AOM_AOM_SIMD_H_
-#define AOM_DSP_AOM_AOM_SIMD_H_
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
#include <stdint.h>
@@ -35,4 +35,4 @@
#include "simd/v256_intrinsics.h"
#endif
-#endif // AOM_DSP_AOM_AOM_SIMD_H_
+#endif // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
index 02a8b3a17..eb333f6f6 100644
--- a/third_party/aom/aom_dsp/aom_simd_inline.h
+++ b/third_party/aom/aom_dsp/aom_simd_inline.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
-#define AOM_DSP_AOM_SIMD_INLINE_H_
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
#include "aom/aom_integer.h"
@@ -18,4 +18,4 @@
#define SIMD_INLINE static AOM_FORCE_INLINE
#endif
-#endif // AOM_DSP_AOM_SIMD_INLINE_H_
+#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
index 82c0b0e28..e7f08a5fd 100644
--- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -86,7 +86,8 @@ static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
const int16x8_t vec_round_bits) {
int16x8_t src0_0, src0_1;
int16x8_t src1_0, src1_1;
- uint64x2_t tu0, tu1, tu2, tu3;
+ uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
+ tu3 = vdupq_n_u64(0);
int16x8_t mask0_1, mask2_3;
int16x8_t res0, res1;
@@ -154,7 +155,8 @@ void aom_lowbd_blend_a64_d16_mask_neon(
assert(IS_POWER_OF_TWO(w));
uint8x8_t s0, s1, s2, s3;
- uint32x2_t tu0, tu1, tu2, tu3;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
int16x8_t mask0, mask1, mask2, mask3;
int16x8_t mask4, mask5, mask6, mask7;
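The two hunks above replace declarations of lane-load targets with zero-initialized vectors. Lane loads such as vld1q_lane_u64() write only the named lane and read the remaining lanes from their src operand, so starting from an uninitialized vector reads indeterminate values (and draws a compiler warning) even when every lane is eventually filled. A minimal sketch of the corrected pattern (load_two_u64 is a hypothetical helper, not code from this file):

#include <arm_neon.h>
#include <stdint.h>

static inline uint64x2_t load_two_u64(const uint64_t *a, const uint64_t *b) {
  uint64x2_t v = vdupq_n_u64(0);  // define both lanes up front
  v = vld1q_lane_u64(a, v, 0);    // overwrite lane 0
  v = vld1q_lane_u64(b, v, 1);    // overwrite lane 1
  return v;
}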
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
index 44d821821..cf618eee7 100644
--- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -17,13 +17,9 @@
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
#include "aom_dsp/variance.h"
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
@@ -83,9 +79,9 @@ unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
- bilinear_filters[xoffset]);
+ bilinear_filters_2t[xoffset]);
var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
- bilinear_filters[yoffset]);
+ bilinear_filters_2t[yoffset]);
return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
@@ -98,9 +94,9 @@ unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
- bilinear_filters[xoffset]);
+ bilinear_filters_2t[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
- bilinear_filters[yoffset]);
+ bilinear_filters_2t[yoffset]);
return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
@@ -113,9 +109,9 @@ unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
- bilinear_filters[xoffset]);
+ bilinear_filters_2t[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
- bilinear_filters[yoffset]);
+ bilinear_filters_2t[yoffset]);
return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
@@ -128,8 +124,8 @@ unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
- bilinear_filters[xoffset]);
+ bilinear_filters_2t[xoffset]);
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
- bilinear_filters[yoffset]);
+ bilinear_filters_2t[yoffset]);
return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
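The local filter table deleted above was an exact copy of bilinear_filters_2t from aom_dsp/aom_filter.h, so this change is pure deduplication: each entry is a two-tap pair summing to 128 (1 << FILTER_BITS, with FILTER_BITS == 7), indexed by sub-pixel offset. A scalar model of one tap application, which the NEON kernels above perform vector-wide:

#include <stdint.h>

// Blend two neighboring pixels with a bilinear_filters_2t[offset] pair;
// f[0] + f[1] == 128, so this is a rounded weighted average.
static uint8_t bilinear_2t(uint8_t a, uint8_t b, const uint8_t f[2]) {
  return (uint8_t)((a * f[0] + b * f[1] + 64) >> 7);
}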
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
index d05c3efdc..01088010a 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -36,7 +36,7 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
uint16_t aom_read_primitive_quniform_(aom_reader *r,
uint16_t n ACCT_STR_PARAM) {
if (n <= 1) return 0;
- const int l = get_msb(n - 1) + 1;
+ const int l = get_msb(n) + 1;
const int m = (1 << l) - n;
const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
@@ -45,7 +45,7 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r,
static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
uint16_t n) {
if (n <= 1) return 0;
- const int l = get_msb(n - 1) + 1;
+ const int l = get_msb(n) + 1;
const int m = (1 << l) - n;
const int v = aom_rb_read_literal(rb, l - 1);
return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
@@ -121,13 +121,3 @@ int16_t aom_rb_read_signed_primitive_refsubexpfin(
const uint16_t scaled_n = (n << 1) - 1;
return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
}
-
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
- int leading_zeros = 0;
- while (!aom_rb_read_bit(rb)) ++leading_zeros;
- // Maximum 32 bits.
- if (leading_zeros >= 32) return UINT32_MAX;
- const uint32_t base = (1u << leading_zeros) - 1;
- const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
- return base + value;
-}
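The reader-side switch from get_msb(n - 1) to get_msb(n) mirrors the writer change below; the emitted bits work out the same (for power-of-two n the escape branch is simply avoided), and the code stays quasi-uniform. A self-contained model of the cost function, matching aom_count_primitive_quniform() after this patch:

#include <stdint.h>

static int get_msb_model(unsigned n) {  // floor(log2(n)), n > 0
  int b = 0;
  while (n >>= 1) ++b;
  return b;
}

static int quniform_bits(uint16_t n, uint16_t v) {
  if (n <= 1) return 0;
  const int l = get_msb_model(n) + 1;  // long-code length
  const int m = (1 << l) - n;          // how many values get the short code
  return v < m ? l - 1 : l;
}

// Example: n == 5 gives l == 3, m == 3, so symbols 0..2 cost 2 bits and
// 3..4 cost 3 bits; n == 4 now gives l == 3, m == 4, a flat 2-bit code.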
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
index 5253c6154..364a67469 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BINARY_CODES_READER_H_
-#define AOM_DSP_BINARY_CODES_READER_H_
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
#ifdef __cplusplus
extern "C" {
@@ -40,10 +40,8 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
int16_t aom_rb_read_signed_primitive_refsubexpfin(
struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
-
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_BINARY_CODES_READER_H_
+#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
index 8f74f0942..ee7a9f567 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -59,7 +59,7 @@ int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
// Encodes a value v in [0, n-1] quasi-uniformly
void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
if (n <= 1) return;
- const int l = get_msb(n - 1) + 1;
+ const int l = get_msb(n) + 1;
const int m = (1 << l) - n;
if (v < m) {
aom_write_literal(w, v, l - 1);
@@ -72,7 +72,7 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
uint16_t n, uint16_t v) {
if (n <= 1) return;
- const int l = get_msb(n - 1) + 1;
+ const int l = get_msb(n) + 1;
const int m = (1 << l) - n;
if (v < m) {
aom_wb_write_literal(wb, v, l - 1);
@@ -84,7 +84,7 @@ static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
if (n <= 1) return 0;
- const int l = get_msb(n - 1) + 1;
+ const int l = get_msb(n) + 1;
const int m = (1 << l) - n;
return v < m ? l - 1 : l;
}
@@ -208,15 +208,3 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
const uint16_t scaled_n = (n << 1) - 1;
return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
}
-
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
- int64_t shift_val = ++v;
- int leading_zeroes = 1;
-
- assert(shift_val > 0);
-
- while (shift_val >>= 1) leading_zeroes += 2;
-
- aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
- aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
-}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
index 784c721a6..c360e0e29 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BINARY_CODES_WRITER_H_
-#define AOM_DSP_BINARY_CODES_WRITER_H_
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
#ifdef __cplusplus
extern "C" {
@@ -61,9 +61,8 @@ int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
uint16_t v);
int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
int16_t v);
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_BINARY_CODES_WRITER_H_
+#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
index 328935be9..7c0efcc78 100644
--- a/third_party/aom/aom_dsp/bitreader.h
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BITREADER_H_
-#define AOM_DSP_BITREADER_H_
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
#include <assert.h>
#include <limits.h>
@@ -69,6 +69,12 @@ static INLINE int aom_reader_has_error(aom_reader *r) {
return aom_daala_reader_has_error(r);
}
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+static INLINE int aom_reader_has_overflowed(const aom_reader *r) {
+ return aom_daala_reader_has_overflowed(r);
+}
+
// Returns the position in the bit reader in bits.
static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
return aom_daala_reader_tell(r);
@@ -151,4 +157,4 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
} // extern "C"
#endif
-#endif // AOM_DSP_BITREADER_H_
+#endif // AOM_AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
index 02b5ef924..b53211784 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -55,3 +55,13 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
return ((int)value) >> nbits;
}
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+ int leading_zeros = 0;
+ while (!aom_rb_read_bit(rb)) ++leading_zeros;
+ // Maximum 32 bits.
+ if (leading_zeros >= 32) return UINT32_MAX;
+ const uint32_t base = (1u << leading_zeros) - 1;
+ const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+ return base + value;
+}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
index 5c94ab883..725ca1ea2 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BITREADER_BUFFER_H_
-#define AOM_DSP_BITREADER_BUFFER_H_
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
#include <limits.h>
@@ -41,8 +41,10 @@ uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_BITREADER_BUFFER_H_
+#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
index de1b1d048..b5ecc2382 100644
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BITWRITER_H_
-#define AOM_DSP_BITWRITER_H_
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
#include <assert.h>
@@ -86,4 +86,4 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
} // extern "C"
#endif
-#endif // AOM_DSP_BITWRITER_H_
+#endif // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
index a563bf684..596246deb 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -73,3 +73,15 @@ void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits) {
aom_wb_write_literal(wb, data, bits + 1);
}
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+ int64_t shift_val = ++v;
+ int leading_zeroes = 1;
+
+ assert(shift_val > 0);
+
+ while (shift_val >>= 1) leading_zeroes += 2;
+
+ aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+ aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
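Moving the uvlc reader and writer into the always-built bitreader_buffer.c / bitwriter_buffer.c pair fits their use: uvlc values appear in AV1's uncompressed (bit-buffer) header syntax, which both encoder and decoder handle. The code itself is plain Exp-Golomb. A self-contained model of the bit layout the two functions agree on:

#include <stdint.h>

// uvlc(v) is: k zero bits, then the (k + 1)-bit value v + 1 (whose top
// bit, a 1, doubles as the terminator), where k = floor(log2(v + 1)).
// Total length is 2k + 1 bits. The reader counts the zeros, consumes
// the terminator in the same loop, reads k more bits and returns
// (1 << k) - 1 + literal, which undoes the v + 1 bias.
static int uvlc_length(uint32_t v) {
  int k = 0;
  for (uint64_t t = (uint64_t)v + 1; t >>= 1;) ++k;
  return 2 * k + 1;
}

// uvlc_length(0) == 1 (bits "1"); uvlc_length(3) == 5 (bits "00100",
// which the reader decodes as (1 << 2) - 1 + 0 == 3).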
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
index f7f75a097..d0311284f 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.h
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BITWRITER_BUFFER_H_
-#define AOM_DSP_BITWRITER_BUFFER_H_
+#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
+#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
#include "aom/aom_integer.h"
@@ -42,8 +42,10 @@ void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits);
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_BITWRITER_BUFFER_H_
+#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
index 434bb83a1..fd87dc181 100644
--- a/third_party/aom/aom_dsp/blend.h
+++ b/third_party/aom/aom_dsp/blend.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BLEND_H_
-#define AOM_DSP_BLEND_H_
+#ifndef AOM_AOM_DSP_BLEND_H_
+#define AOM_AOM_DSP_BLEND_H_
#include "aom_ports/mem.h"
@@ -42,4 +42,4 @@
#define DIFF_FACTOR_LOG2 4
#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
-#endif // AOM_DSP_BLEND_H_
+#endif // AOM_AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
index cf7df1dbf..985fcdf9e 100644
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ b/third_party/aom/aom_dsp/buf_ans.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_BUF_ANS_H_
-#define AOM_DSP_BUF_ANS_H_
+#ifndef AOM_AOM_DSP_BUF_ANS_H_
+#define AOM_AOM_DSP_BUF_ANS_H_
// Buffered forward ANS writer.
// Symbols are written to the writer in forward (decode) order and serialized
// backwards due to ANS's stack like behavior.
@@ -133,4 +133,4 @@ static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_DSP_BUF_ANS_H_
+#endif // AOM_AOM_DSP_BUF_ANS_H_
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
index 4e224904e..6c2259f23 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@@ -39,3 +39,9 @@ uint32_t aom_daala_reader_tell(const daala_reader *r) {
uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
return od_ec_dec_tell_frac(&r->ec);
}
+
+int aom_daala_reader_has_overflowed(const daala_reader *r) {
+ const uint32_t tell_bits = aom_daala_reader_tell(r);
+ const uint32_t tell_bytes = (tell_bits + 7) >> 3;
+ return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
+}
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
index 60c197a49..ba78f916d 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_DAALABOOLREADER_H_
-#define AOM_DSP_DAALABOOLREADER_H_
+#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_
+#define AOM_AOM_DSP_DAALABOOLREADER_H_
#include "aom/aom_integer.h"
#include "aom_dsp/entdec.h"
@@ -44,6 +44,9 @@ const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
const uint8_t *aom_daala_reader_find_end(daala_reader *r);
uint32_t aom_daala_reader_tell(const daala_reader *r);
uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
+// Returns true if the reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_daala_reader_has_overflowed(const daala_reader *r);
static INLINE int aom_daala_read(daala_reader *r, int prob) {
int bit;
@@ -154,4 +157,4 @@ static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
} // extern "C"
#endif
-#endif
+#endif // AOM_AOM_DSP_DAALABOOLREADER_H_
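The new overflow query lets callers detect truncated input after the fact instead of validating every read. A standalone model of the check added in daalaboolreader.c above (the struct is a stand-in for the real daala_reader fields):

#include <stddef.h>
#include <stdint.h>

typedef struct {
  const uint8_t *buffer, *buffer_end;
  uint32_t bits_consumed;  // what aom_daala_reader_tell() reports
} reader_model;

// True once the reader has consumed more (byte-rounded) bits than the
// buffer actually held, i.e. decoding ran off the end of the data.
static int has_overflowed(const reader_model *r) {
  const uint32_t tell_bytes = (r->bits_consumed + 7) >> 3;
  return (ptrdiff_t)tell_bytes > r->buffer_end - r->buffer;
}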
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
index f9c596c73..3848877ce 100644
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_DAALABOOLWRITER_H_
-#define AOM_DSP_DAALABOOLWRITER_H_
+#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_
+#define AOM_AOM_DSP_DAALABOOLWRITER_H_
#include <stdio.h>
@@ -75,4 +75,4 @@ static INLINE void daala_write_symbol(daala_writer *w, int symb,
} // extern "C"
#endif
-#endif
+#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
index 5c15526e9..7ba2b1c39 100644
--- a/third_party/aom/aom_dsp/entcode.h
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_ENTCODE_H_
-#define AOM_DSP_ENTCODE_H_
+#ifndef AOM_AOM_DSP_ENTCODE_H_
+#define AOM_AOM_DSP_ENTCODE_H_
#include <limits.h>
#include <stddef.h>
@@ -37,4 +37,4 @@ typedef uint32_t od_ec_window;
OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
uint32_t rng);
-#endif // AOM_DSP_ENTCODE_H_
+#endif // AOM_AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
index b8e9078c3..d1764c47b 100644
--- a/third_party/aom/aom_dsp/entdec.c
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -112,6 +112,7 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
int ret) {
int d;
assert(rng <= 65535U);
+ // The number of leading zeros in the 16-bit binary representation of rng.
d = 16 - OD_ILOG_NZ(rng);
dec->cnt -= d;
/*This is equivalent to shifting in 1's instead of 0's.*/
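The added comment pins down what d means here. Assuming OD_ILOG_NZ(x) is the bit width of a nonzero x, i.e. floor(log2(x)) + 1 (consistent with the comment, since 16 minus the bit width is the count of leading zeros in 16 bits), a small model of the renormalization shift:

// For rng == 0x5000 the bit width is 15, so d == 16 - 15 == 1: one left
// shift restores the invariant rng >= 0x8000 (rng always fits 16 bits).
static int renorm_shift(unsigned rng) {  // requires 1 <= rng <= 65535
  int width = 0;
  while (rng >> width) ++width;  // width == OD_ILOG_NZ(rng)
  return 16 - width;
}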
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
index e35c3f99f..283bf1831 100644
--- a/third_party/aom/aom_dsp/entdec.h
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#if !defined(_entdec_H)
-#define _entdec_H (1)
+#ifndef AOM_AOM_DSP_ENTDEC_H_
+#define AOM_AOM_DSP_ENTDEC_H_
#include <limits.h>
#include "aom_dsp/entcode.h"
@@ -80,4 +80,4 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
} // extern "C"
#endif
-#endif
+#endif // AOM_AOM_DSP_ENTDEC_H_
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
index 6866de9b9..a61da263c 100644
--- a/third_party/aom/aom_dsp/entenc.c
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -60,6 +60,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
int s;
c = enc->cnt;
assert(rng <= 65535U);
+ // The number of leading zeros in the 16-bit binary representation of rng.
d = 16 - OD_ILOG_NZ(rng);
s = c + d;
/*TODO: Right now we flush every time we have at least one byte available.
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
index 1988f6818..3551d4250 100644
--- a/third_party/aom/aom_dsp/entenc.h
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#if !defined(_entenc_H)
-#define _entenc_H (1)
+#ifndef AOM_AOM_DSP_ENTENC_H_
+#define AOM_AOM_DSP_ENTENC_H_
#include <stddef.h>
#include "aom_dsp/entcode.h"
@@ -82,4 +82,4 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
} // extern "C"
#endif
-#endif
+#endif // AOM_AOM_DSP_ENTENC_H_
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
index 2f3cd5fdc..5137331ae 100644
--- a/third_party/aom/aom_dsp/fft_common.h
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_FFT_COMMON_H_
-#define AOM_DSP_FFT_COMMON_H_
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
#ifdef __cplusplus
extern "C" {
@@ -1047,4 +1047,4 @@ void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
}
-#endif // AOM_DSP_FFT_COMMON_H_
+#endif // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
index ff1ec41a2..b96e1c319 100644
--- a/third_party/aom/aom_dsp/grain_synthesis.c
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@@ -396,14 +396,15 @@ static void init_random_generator(int luma_line, uint16_t seed) {
random_register ^= ((luma_num * 173 + 105) & 255);
}
-static void generate_luma_grain_block(
+// Return 0 for success, -1 for failure
+static int generate_luma_grain_block(
const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
int left_pad, int top_pad, int right_pad, int bottom_pad) {
if (params->num_y_points == 0) {
memset(luma_grain_block, 0,
sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
- return;
+ return 0;
}
int bit_depth = params->bit_depth;
@@ -433,9 +434,11 @@ static void generate_luma_grain_block(
((wsum + rounding_offset) >> params->ar_coeff_shift),
grain_min, grain_max);
}
+ return 0;
}
-static void generate_chroma_grain_blocks(
+// Return 0 for success, -1 for failure
+static int generate_chroma_grain_blocks(
const aom_film_grain_t *params,
// int** pred_pos_luma,
int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
@@ -510,10 +513,11 @@ static void generate_chroma_grain_blocks(
wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
} else {
- printf(
+ fprintf(
+ stderr,
"Grain synthesis: prediction between two chroma components is "
"not supported!");
- exit(1);
+ return -1;
}
}
if (params->num_cb_points || params->chroma_scaling_from_luma)
@@ -527,6 +531,7 @@ static void generate_chroma_grain_blocks(
((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
grain_min, grain_max);
}
+ return 0;
}
static void init_scaling_function(const int scaling_points[][2], int num_points,
@@ -910,8 +915,8 @@ static void hor_boundary_overlap(int *top_block, int top_stride,
}
}
-void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
- aom_image_t *dst) {
+int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
+ aom_image_t *dst) {
uint8_t *luma, *cb, *cr;
int height, width, luma_stride, chroma_stride;
int use_high_bit_depth = 0;
@@ -953,8 +958,8 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
chroma_subsamp_y = 0;
break;
default: // unknown input format
- printf("Film grain error: input format is not supported!");
- exit(1);
+ fprintf(stderr, "Film grain error: input format is not supported!");
+ return -1;
}
assert(params->bit_depth == src->bit_depth);
@@ -1011,17 +1016,16 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
- av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
- chroma_stride, use_high_bit_depth, chroma_subsamp_y,
- chroma_subsamp_x, mc_identity);
- return;
+ return av1_add_film_grain_run(
+ params, luma, cb, cr, height, width, luma_stride, chroma_stride,
+ use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
}
-void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
- uint8_t *cb, uint8_t *cr, int height, int width,
- int luma_stride, int chroma_stride,
- int use_high_bit_depth, int chroma_subsamp_y,
- int chroma_subsamp_x, int mc_identity) {
+int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity) {
int **pred_pos_luma;
int **pred_pos_chroma;
int *luma_grain_block;
@@ -1085,18 +1089,20 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
chroma_subsamp_x);
- generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
- luma_block_size_y, luma_block_size_x,
- luma_grain_stride, left_pad, top_pad, right_pad,
- bottom_pad);
-
- generate_chroma_grain_blocks(
- params,
- // pred_pos_luma,
- pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
- luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
- chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
- chroma_subsamp_y, chroma_subsamp_x);
+ if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+ luma_block_size_y, luma_block_size_x,
+ luma_grain_stride, left_pad, top_pad, right_pad,
+ bottom_pad))
+ return -1;
+
+ if (generate_chroma_grain_blocks(
+ params,
+ // pred_pos_luma,
+ pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
+ luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
+ chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
+ chroma_subsamp_y, chroma_subsamp_x))
+ return -1;
init_scaling_function(params->scaling_points_y, params->num_y_points,
scaling_lut_y);
@@ -1399,4 +1405,5 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
&cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
&cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
+ return 0;
}
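With av1_add_film_grain() and av1_add_film_grain_run() now returning int, the library no longer calls exit(1) from inside grain synthesis; callers are expected to check the result. A hedged caller sketch (apply_grain is a hypothetical wrapper, not part of the patch):

#include "aom/aom_image.h"
#include "aom_dsp/grain_synthesis.h"

static int apply_grain(const aom_film_grain_t *params, const aom_image_t *src,
                       aom_image_t *dst) {
  // 0 on success, -1 on failure (unknown image format, or AR prediction
  // between the two chroma components, which is unsupported).
  return av1_add_film_grain(params, src, dst);
}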
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
index 65feb6068..7aee6f6f4 100644
--- a/third_party/aom/aom_dsp/grain_synthesis.h
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@@ -13,8 +13,8 @@
* \brief Describes film grain parameters and film grain synthesis
*
*/
-#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_
-#define AOM_AOM_GRAIN_SYNTHESIS_H_
+#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
#ifdef __cplusplus
extern "C" {
@@ -85,6 +85,8 @@ typedef struct {
*
* Add film grain to an image
*
+ * Returns 0 for success, -1 for failure
+ *
* \param[in] grain_params Grain parameters
* \param[in] luma luma plane
* \param[in] cb cb plane
@@ -94,25 +96,27 @@ typedef struct {
* \param[in] luma_stride luma plane stride
* \param[in] chroma_stride chroma plane stride
*/
-void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
- uint8_t *cb, uint8_t *cr, int height, int width,
- int luma_stride, int chroma_stride,
- int use_high_bit_depth, int chroma_subsamp_y,
- int chroma_subsamp_x, int mc_identity);
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity);
/*!\brief Add film grain
*
* Add film grain to an image
*
+ * Returns 0 for success, -1 for failure
+ *
* \param[in] grain_params Grain parameters
* \param[in] src Source image
* \param[out] dst Resulting image with grain
*/
-void av1_add_film_grain(const aom_film_grain_t *grain_params,
- const aom_image_t *src, aom_image_t *dst);
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+ const aom_image_t *src, aom_image_t *dst);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_AOM_GRAIN_SYNTHESIS_H_
+#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
index 5c20413b2..a8ac50730 100644
--- a/third_party/aom/aom_dsp/grain_table.h
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -99,4 +99,4 @@ void aom_film_grain_table_free(aom_film_grain_table_t *t);
}
#endif
-#endif
+#endif // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
index e047d98bc..3ec62a86e 100644
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_INTRAPRED_COMMON_H
-#define _AOM_DSP_INTRAPRED_COMMON_H
+#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
+#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
#include "config/aom_config.h"
@@ -44,4 +44,4 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
};
/* clang-format on */
-#endif // _AOM_DSP_INTRAPRED_COMMON_H
+#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
index a0627c074..852415c20 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
#include "aom_dsp/mips/macros_msa.h"
#include "aom_dsp/aom_filter.h"
@@ -76,4 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
res7_m, out0, out1, out2, out3); \
}
-#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */
+#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
index d51bfa899..c42188d62 100644
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_COMMON_MIPS_DSPR2_H_
-#define AOM_COMMON_MIPS_DSPR2_H_
+#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
#include <assert.h>
@@ -48,4 +48,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) {
} // extern "C"
#endif
-#endif // AOM_COMMON_MIPS_DSPR2_H_
+#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
index 2a8f75938..097da73ca 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -15,7 +15,6 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
index ac87936da..40abfd89e 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -15,7 +15,6 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
index e7b8d531b..e5d48a884 100644
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
-#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
+#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
#include <assert.h>
@@ -45,4 +45,4 @@ void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
} // extern "C"
#endif
-#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_
+#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
index 3e38ef3fb..28f0dc35a 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
#include <stdlib.h>
@@ -733,4 +733,4 @@ static INLINE void wide_mbfilter_dspr2(
} // extern "C"
#endif
-#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
index cb599cf2e..62295d69d 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
#include <stdlib.h>
@@ -434,4 +434,4 @@ extern "C" {
} // extern "C"
#endif
-#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
index 6db1dac08..a0f57f386 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
#include <stdlib.h>
@@ -354,4 +354,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
} // extern "C"
#endif
-#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
index 450594262..54b0bb4bd 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_LOOPFILTER_MSA_H_
-#define AOM_DSP_LOOPFILTER_MSA_H_
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
#include "aom_dsp/mips/macros_msa.h"
@@ -248,4 +248,4 @@
mask_out = limit_in < (v16u8)mask_out; \
mask_out = __msa_xori_b(mask_out, 0xff); \
}
-#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
index eb919d42b..9bfc27147 100644
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_MIPS_MACROS_MSA_H_
-#define AOM_DSP_MIPS_MACROS_MSA_H_
+#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
+#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
#include <msa.h>
@@ -2055,4 +2055,4 @@
\
tmp1_m; \
})
-#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */
+#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
index a8ee85b6b..810b6efaa 100644
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -13,13 +13,9 @@
#include "aom_ports/mem.h"
#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/aom_filter.h"
#include "aom_dsp/variance.h"
-static const uint8_t bilinear_filters_msa[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
#define CALC_MSE_AVG_B(src, ref, var, sub) \
{ \
v16u8 src_l0_m, src_l1_m; \
@@ -1626,8 +1622,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
uint32_t *sse) { \
int32_t diff; \
uint32_t var; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
\
if (yoffset) { \
if (xoffset) { \
@@ -1680,8 +1676,8 @@ AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
uint32_t *sse, const uint8_t *sec_pred) { \
int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
\
if (yoffset) { \
if (xoffset) { \
@@ -1730,8 +1726,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
int32_t ref_stride, uint32_t *sse,
const uint8_t *sec_pred) {
int32_t diff;
- const uint8_t *h_filter = bilinear_filters_msa[xoffset];
- const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset];
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset];
if (yoffset) {
if (xoffset) {
@@ -1763,8 +1759,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
uint32_t *sse, const uint8_t *sec_pred) { \
int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
\
if (yoffset) { \
if (xoffset) { \
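The MSA file previously carried a private copy of the 2-tap bilinear kernel table; it now includes aom_dsp/aom_filter.h and uses the shared bilinear_filters_2t, which holds the same eight {w0, w1} pairs (w0 + w1 = 128) indexed by the 1/8-pel offset, as the deleted lines above show. A standalone sketch of how such a table drives a 2-tap horizontal filter (plain C reproduction, not the MSA code):

    #include <stdint.h>

    /* Same layout and values as the table removed above: 7-bit weights. */
    static const uint8_t kBilinear2t[8][2] = {
      { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
      { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
    };

    static void bilinear_horiz(const uint8_t *src, uint8_t *dst, int w,
                               int xoffset /* 0..7, 1/8-pel position */) {
      const uint8_t *f = kBilinear2t[xoffset];
      for (int x = 0; x < w; x++)
        dst[x] = (uint8_t)((src[x] * f[0] + src[x + 1] * f[1] + 64) >> 7);
    }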
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
index 5975c62e8..2faee8506 100644
--- a/third_party/aom/aom_dsp/noise_model.c
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -1135,7 +1135,9 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
return 0;
}
+ uint16_t random_seed = film_grain->random_seed;
memset(film_grain, 0, sizeof(*film_grain));
+ film_grain->random_seed = random_seed;
film_grain->apply_grain = 1;
film_grain->update_parameters = 1;
@@ -1633,7 +1635,7 @@ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
return 0;
}
if (!film_grain->random_seed) {
- film_grain->random_seed = 1071;
+ film_grain->random_seed = 7391;
}
memcpy(raw_data[0], ctx->denoised[0],
(strides[0] * sd->y_height) << use_highbd);
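The aom_noise_model_get_grain_parameters() fix above preserves the caller-provided random seed across the memset() that zeroes the rest of the struct. The idiom in isolation (struct and field names here are illustrative):

    #include <string.h>
    #include <stdint.h>

    struct params { uint16_t random_seed; int apply; /* ...more fields... */ };

    static void reset_keeping_seed(struct params *p) {
      const uint16_t seed = p->random_seed;  /* save before wiping */
      memset(p, 0, sizeof(*p));              /* zero everything */
      p->random_seed = seed;                 /* restore the one kept field */
    }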
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
index b07bf8617..049d5be15 100644
--- a/third_party/aom/aom_dsp/noise_model.h
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_NOISE_MODEL_H_
-#define AOM_DSP_NOISE_MODEL_H_
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
#ifdef __cplusplus
extern "C" {
@@ -320,4 +320,4 @@ void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_DSP_NOISE_MODEL_H_
+#endif // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
index ea4d9e3de..2284a171a 100644
--- a/third_party/aom/aom_dsp/noise_util.h
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_NOISE_UTIL_H_
-#define AOM_DSP_NOISE_UTIL_H_
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
#ifdef __cplusplus
extern "C" {
@@ -65,4 +65,4 @@ int aom_noise_data_validate(const double *data, int w, int h);
} // extern "C"
#endif // __cplusplus
-#endif // AOM_DSP_NOISE_UTIL_H_
+#endif // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h
index 11a8c5ad7..f3d87f264 100644
--- a/third_party/aom/aom_dsp/postproc.h
+++ b/third_party/aom/aom_dsp/postproc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_POSTPROC_H_
-#define AOM_DSP_POSTPROC_H_
+#ifndef AOM_AOM_DSP_POSTPROC_H_
+#define AOM_AOM_DSP_POSTPROC_H_
#ifdef __cplusplus
extern "C" {
@@ -23,4 +23,4 @@ int aom_setup_noise(double sigma, int size, char *noise);
}
#endif
-#endif // AOM_DSP_POSTPROC_H_
+#endif // AOM_AOM_DSP_POSTPROC_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
index 85dd4249d..d003a986e 100644
--- a/third_party/aom/aom_dsp/prob.h
+++ b/third_party/aom/aom_dsp/prob.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_PROB_H_
-#define AOM_DSP_PROB_H_
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
#include <assert.h>
#include <stdio.h>
@@ -668,4 +668,4 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
} // extern "C"
#endif
-#endif // AOM_DSP_PROB_H_
+#endif // AOM_AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
index 37d3bb585..50f376a4a 100644
--- a/third_party/aom/aom_dsp/psnr.c
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -295,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
default: assert(plane >= 0 && plane <= 2); return 0;
}
}
- (void)highbd;
switch (plane) {
case 0: return aom_get_y_sse(a, b);
case 1: return aom_get_u_sse(a, b);
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
index 8300b0a88..58e4e71ee 100644
--- a/third_party/aom/aom_dsp/psnr.h
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_PSNR_H_
-#define AOM_DSP_PSNR_H_
+#ifndef AOM_AOM_DSP_PSNR_H_
+#define AOM_AOM_DSP_PSNR_H_
#include "aom_scale/yv12config.h"
@@ -76,4 +76,4 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_PSNR_H_
+#endif // AOM_AOM_DSP_PSNR_H_
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
index e1601cc7d..62dbd86a9 100644
--- a/third_party/aom/aom_dsp/quantize.c
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -13,8 +13,8 @@
#include "aom_mem/aom_mem.h"
void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -29,56 +29,54 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const int coeff = coeff_ptr[rc] * wt;
-
- if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
- coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
- non_zero_count--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ non_zero_count--;
+ else
+ break;
+ }
+
+ // Quantization pass: All coefficients with index >= non_zero_count are
+ // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
- // Quantization pass: All coefficients with index >= zero_flag are
- // skippable. Note: zero_flag can be zero.
- for (i = 0; i < non_zero_count; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- int tmp32;
-
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
- int64_t tmp =
- clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
- INT16_MIN, INT16_MAX);
- tmp *= wt;
- tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >>
- (16 - log_scale + AOM_QM_BITS)); // quantization
- qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
- const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
- const int dequant =
- (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
- AOM_QM_BITS;
- const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
- dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-
- if (tmp32) eob = i;
- }
+ if (tmp32) eob = i;
}
}
*eob_ptr = eob + 1;
}
void highbd_quantize_b_helper_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
@@ -95,42 +93,40 @@ void highbd_quantize_b_helper_c(
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const int coeff = coeff_ptr[rc] * wt;
-
- // If the coefficient is out of the base ZBIN range, keep it for
- // quantization.
- if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
- coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
- idx_arr[idx++] = i;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+ coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ idx_arr[idx++] = i;
+ }
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = scan[idx_arr[i]];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
- const int64_t tmpw = tmp1 * wt;
- const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
- const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
- (16 - log_scale + AOM_QM_BITS));
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
- AOM_QM_BITS;
- const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
- dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
- if (abs_qcoeff) eob = idx_arr[i];
- }
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = idx_arr[i];
}
*eob_ptr = eob + 1;
}
@@ -138,74 +134,73 @@ void highbd_quantize_b_helper_c(
/* These functions should only be called when quantisation matrices
are not used. */
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0);
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
}
void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
}
void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
- dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
}
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
- round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
NULL, NULL, 0);
}
void aom_highbd_quantize_b_32x32_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
- round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
NULL, NULL, 1);
}
void aom_highbd_quantize_b_64x64_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr,
- round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
NULL, NULL, 2);
}
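Dropping skip_block removes a parameter that every call site passed as a constant, and the un-indented bodies now always run the pre-scan and quantization passes. The sign handling those passes rely on is the usual two's-complement trick, shown standalone (like the aom code, this assumes arithmetic right shift of negative ints):

    #include <stdint.h>

    /* Branch-free |x| and sign restore, as used in the quantization pass. */
    static int32_t quantize_sign_roundtrip(int32_t coeff) {
      const int32_t sign = coeff >> 31;                 /* 0 or -1 (all ones) */
      const int32_t abs_coeff = (coeff ^ sign) - sign;  /* |coeff| */
      /* ... quantize abs_coeff here ... */
      return (abs_coeff ^ sign) - sign;                 /* reapply the sign */
    }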
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
index 56d50b929..c55ab234e 100644
--- a/third_party/aom/aom_dsp/quantize.h
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_QUANTIZE_H_
-#define AOM_DSP_QUANTIZE_H_
+#ifndef AOM_AOM_DSP_QUANTIZE_H_
+#define AOM_AOM_DSP_QUANTIZE_H_
#include "config/aom_config.h"
@@ -21,8 +21,8 @@ extern "C" {
#endif
void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
@@ -30,24 +30,23 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const qm_val_t *iqm_ptr, const int log_scale);
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan);
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
void highbd_quantize_b_helper_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr, const int log_scale);
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -57,4 +56,4 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
} // extern "C"
#endif
-#endif // AOM_DSP_QUANTIZE_H_
+#endif // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
index ede4c583b..1e24df4a5 100644
--- a/third_party/aom/aom_dsp/sad.c
+++ b/third_party/aom/aom_dsp/sad.c
@@ -200,15 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
- aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \
+ ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
} \
unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
uint16_t comp_pred[m * n]; \
- aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, \
- ref_stride, jcp_param); \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \
+ m, n, ref, ref_stride, jcp_param); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
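The high-bit-depth SAD wrappers now pass comp_pred through CONVERT_TO_BYTEPTR so the averaging helpers receive the uint8_t* "tagged pointer" form used across the high-bit-depth API. A sketch of the round-trip convention, assuming libaom's shift-based macro definitions (kept in aom_ports/mem.h) and a 2-byte-aligned buffer:

    #include <stdint.h>

    /* Assumed definitions, copied for illustration. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

    static void roundtrip(uint16_t *buf16) {
      uint8_t *tagged = CONVERT_TO_BYTEPTR(buf16);   /* for uint8_t* APIs */
      uint16_t *back = CONVERT_TO_SHORTPTR(tagged);  /* recovers buf16 */
      (void)back;  /* back == buf16 when buf16 is 2-byte aligned */
    }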
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
index 51a38a7e1..01dbb8fd2 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V128_INTRINSICS_H
-#define _V128_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
#include <stdio.h>
#include <stdlib.h>
@@ -341,4 +341,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
return c_v128_ssd_s16_sum(s);
}
-#endif /* _V128_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
index d4fec4237..3c669d579 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V128_INTRINSICS_H
-#define _V128_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
#include <arm_neon.h>
@@ -375,7 +375,8 @@ SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
vandq_u8(vreinterpretq_u8_s64(a),
vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
- return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+ return v64_low_u32(
+ v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
#endif
}
@@ -954,4 +955,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
}
-#endif /* _V128_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
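The ARM fix above returns only the low 32 bits of the zipped mask (v64_low_u32) rather than the full 64-bit lane, matching the declared uint32_t return type of v128_movemask_8. What the byte-movemask computes, as a scalar sketch:

    #include <stdint.h>

    /* Collect the sign bit of each of 16 bytes into one 16-bit mask,
       the scalar equivalent of the NEON multiply/pairwise-add path. */
    static uint32_t movemask_8x16(const uint8_t v[16]) {
      uint32_t mask = 0;
      for (int i = 0; i < 16; i++)
        mask |= (uint32_t)(v[i] >> 7) << i;  /* sign bit of byte i -> bit i */
      return mask;  /* only the low 16 bits are meaningful */
    }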
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
index e508f6ad7..bbe9a9d28 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V128_INTRINSICS_C_H
-#define _V128_INTRINSICS_C_H
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
#include <stdio.h>
#include <stdlib.h>
@@ -885,4 +885,4 @@ SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
-#endif /* _V128_INTRINSICS_C_H */
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
index f9043fe99..6c7241ff4 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V128_INTRINSICS_H
-#define _V128_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
#include <stdint.h>
#include "aom_dsp/simd/v64_intrinsics_x86.h"
@@ -653,4 +653,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
}
-#endif /* _V128_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
index 4b70cc57b..cb99d35b7 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V256_INTRINSICS_H
-#define _V256_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
#include <stdio.h>
#include <stdlib.h>
@@ -373,4 +373,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
return c_v256_ssd_s16_sum(s);
}
-#endif /* _V256_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
index d96638488..bd86ea172 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
@@ -9,9 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V256_INTRINSICS_H
-#define _V256_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
#include "aom_dsp/simd/v256_intrinsics_v128.h"
-#endif /* _V256_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
index 5b412df71..a1c08e95a 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V256_INTRINSICS_C_H
-#define _V256_INTRINSICS_C_H
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
#include <stdio.h>
#include <stdlib.h>
@@ -950,4 +950,4 @@ SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
-#endif /* _V256_INTRINSICS_C_H */
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
index 60b2a1791..d5b7905ef 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V256_INTRINSICS_V128_H
-#define _V256_INTRINSICS_V128_H
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
#if HAVE_NEON
#include "aom_dsp/simd/v128_intrinsics_arm.h"
@@ -870,4 +870,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
}
-#endif /* _V256_INTRINSICS_V128_H */
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
index 05f205169..44594bc41 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V256_INTRINSICS_H
-#define _V256_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
#if !defined(__AVX2__)
@@ -747,4 +747,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
#endif
-#endif /* _V256_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
index 6ce53c6a9..afc55428d 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V64_INTRINSICS_H
-#define _V64_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
#include <stdio.h>
#include <stdlib.h>
@@ -229,4 +229,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
return c_v64_shr_n_s32(a, c);
}
-#endif /* _V64_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
index 267441b02..8f39ad6e8 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V64_INTRINSICS_H
-#define _V64_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
#include <arm_neon.h>
@@ -677,4 +677,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
#endif
-#endif /* _V64_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
index 8158899cb..028d68c4f 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V64_INTRINSICS_C_H
-#define _V64_INTRINSICS_C_H
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
/* Note: This implements the intrinsics in plain, unoptimised C.
Intended for reference, porting or debugging. */
@@ -965,4 +965,4 @@ SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
return c_v64_shr_s32(a, c);
}
-#endif /* _V64_INTRINSICS_C_H */
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
index 130052ee1..5f9a57b37 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _V64_INTRINSICS_H
-#define _V64_INTRINSICS_H
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
#include <emmintrin.h>
#if defined(__SSSE3__)
@@ -488,4 +488,4 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
-#endif /* _V64_INTRINSICS_H */
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
new file mode 100644
index 000000000..249394807
--- /dev/null
+++ b/third_party/aom/aom_dsp/sse.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* Sum the squared difference between every corresponding pair of elements. */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = abs(a[x] - b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+
+int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
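A minimal usage sketch for the new aom_sse_c() (buffer contents are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
                      int b_stride, int width, int height);

    int main(void) {
      const uint8_t a[2][4] = { { 10, 20, 30, 40 }, { 50, 60, 70, 80 } };
      const uint8_t b[2][4] = { { 11, 20, 28, 40 }, { 50, 63, 70, 80 } };
      /* per-element diffs 1, 2, 3 -> SSE = 1 + 4 + 9 = 14 */
      printf("%lld\n", (long long)aom_sse_c(a[0], 4, b[0], 4, 4, 2));
      return 0;
    }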
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
index 6ce3d7acb..681770ba9 100644
--- a/third_party/aom/aom_dsp/ssim.c
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -275,7 +275,7 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
for (i = 0; i < height;
i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
for (j = 0; j < width; j += 4, ++c) {
- Ssimv sv = { 0 };
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
double ssim;
double ssim2;
double dssim;
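Spelling out all six initializers (assuming Ssimv has six scalar members, as the new initializer suggests) keeps the same zero values while silencing missing-field-initializer warnings; in C the two forms are equivalent:

    typedef struct { double a, b, c; } Example;  /* stand-in, not Ssimv */

    void init_forms(void) {
      Example x = { 0 };        /* rest implicitly zeroed (C99 6.7.8p21) */
      Example y = { 0, 0, 0 };  /* identical result, but clean under
                                   -Wmissing-field-initializers */
      (void)x; (void)y;
    }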
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
index c8a389dfe..55038f4c2 100644
--- a/third_party/aom/aom_dsp/ssim.h
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_SSIM_H_
-#define AOM_DSP_SSIM_H_
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
#define MAX_SSIM_DB 100.0;
@@ -84,4 +84,4 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
} // extern "C"
#endif
-#endif // AOM_DSP_SSIM_H_
+#endif // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
index 7deb0aea3..f98242840 100644
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_TXFM_COMMON_H_
-#define AOM_DSP_TXFM_COMMON_H_
+#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
+#define AOM_AOM_DSP_TXFM_COMMON_H_
#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/enums.h"
@@ -88,4 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}
-#endif // AOM_DSP_TXFM_COMMON_H_
+#endif // AOM_AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
index 817ebe15d..23b715309 100644
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@@ -55,24 +55,6 @@ uint32_t aom_get_mb_ss_c(const int16_t *a) {
return sum;
}
-uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
-}
-
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, uint32_t *sse, int *sum) {
int i, j;
@@ -302,7 +284,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
int mi_row, int mi_col, const MV *const mv,
uint8_t *comp_pred, int width, int height,
int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride) {
+ int ref_stride, int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -370,7 +352,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
@@ -387,7 +369,9 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
}
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
for (int i = 0; i < height; i++) {
@@ -398,13 +382,13 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
- width, height);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
- width, height);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
} else {
DECLARE_ALIGNED(16, uint8_t,
temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
@@ -415,12 +399,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
}
}
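The new subpel_search argument selects the interpolation kernel length at the call site: a cheaper 4-tap filter for fast subpel motion search, the regular 8-tap one otherwise. A hypothetical standalone reproduction of that dispatch (types and names here are scaffolding, not libaom's):

    #include <assert.h>

    typedef struct { int taps; } FilterParams;
    static const FilterParams kFilter4 = { 4 };
    static const FilterParams kFilter8 = { 8 };

    /* Shorter taps trade accuracy for speed during search upsampling. */
    static const FilterParams *choose_filter(int subpel_search) {
      return (subpel_search == 1) ? &kFilter4 : &kFilter8;
    }

    int main(void) {
      assert(choose_filter(1)->taps == 4);
      assert(choose_filter(0)->taps == 8);
      return 0;
    }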
@@ -429,11 +413,11 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
uint8_t *comp_pred, const uint8_t *pred,
int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref,
- int ref_stride) {
+ int ref_stride, int subpel_search) {
int i, j;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
@@ -466,13 +450,14 @@ void aom_jnt_comp_avg_upsampled_pred_c(
MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
@@ -706,125 +691,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
dst, dst_stride, sse); \
}
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
}
/* All three forms of the variance are available in the same sizes. */
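For readers tracing the HIGHBD_SUB_PIXEL_AVG_VAR expansions above: each function interpolates the source with a two-pass bilinear filter (horizontal pass into fdata3, vertical pass into temp2), folds in the second predictor, and only then measures variance against dst. A minimal scalar sketch of one filtering pass, assuming the bilinear_filters_2t kernels are 2-tap and sum to 128 (7-bit scale); the _sketch names are illustrative, not aom API:

#include <stdint.h>

#define SKETCH_FILTER_BITS 7 /* assumed scale of bilinear_filters_2t */

/* One pass of the two-pass bilinear interpolation: pixel_step == 1
 * filters horizontally, pixel_step == stride filters vertically. */
static void var_filter_bil_pass_sketch(const uint16_t *src, uint16_t *out,
                                       int stride, int pixel_step,
                                       int out_h, int out_w,
                                       const uint8_t *kernel) {
  for (int i = 0; i < out_h; ++i) {
    for (int j = 0; j < out_w; ++j) {
      /* weighted average of a pixel and its sub-pixel neighbor */
      out[j] = (uint16_t)((src[j] * kernel[0] +
                           src[j + pixel_step] * kernel[1] +
                           (1 << (SKETCH_FILTER_BITS - 1))) >>
                          SKETCH_FILTER_BITS);
    }
    src += stride;
    out += out_w;
  }
}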
@@ -867,12 +852,13 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
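Context for the signature change in this hunk: the RTCD prototypes use uint8_t pointers everywhere, so high-bitdepth callers tunnel their uint16_t buffers through uint8_t * via shift-punning macros, which the first line of the rewritten function undoes with CONVERT_TO_SHORTPTR. A simplified model of that convention (patterned on aom_dsp_common.h; treat the exact definitions as an assumption):

#include <stdint.h>

/* Sketch of the high-bitdepth pointer convention: a uint16_t buffer is
 * encoded as a uint8_t * by halving the address and decoded by doubling
 * it; the two macros are only ever used as a matched pair. */
#define SKETCH_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
#define SKETCH_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))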
@@ -887,9 +873,10 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
const struct AV1Common *const cm, int mi_row,
int mi_col, const MV *const mv,
- uint16_t *comp_pred, int width, int height,
+ uint8_t *comp_pred8, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref8, int ref_stride, int bd) {
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -902,8 +889,6 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
if (is_scaled) {
// Note: This is mostly a copy from the >=8X8 case in
// build_inter_predictors() function, with some small tweaks.
- uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
-
// Some assumptions.
const int plane = 0;
@@ -958,7 +943,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
@@ -975,13 +960,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
}
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
- const uint16_t *ref;
- int i;
- ref = CONVERT_TO_SHORTPTR(ref8);
- for (i = 0; i < height; i++) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (int i = 0; i < height; i++) {
memcpy(comp_pred, ref, width * sizeof(*comp_pred));
comp_pred += width;
ref += ref_stride;
@@ -989,13 +975,13 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, kernel, 16, NULL, -1, width, height, bd);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, NULL, -1, kernel, 16, width, height, bd);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
} else {
DECLARE_ALIGNED(16, uint16_t,
temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
@@ -1012,22 +998,23 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
intermediate_height, bd);
aom_highbd_convolve8_vert(
CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
- 16, width, height, bd);
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
}
}
void aom_highbd_comp_avg_upsampled_pred_c(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd) {
+ int ref_stride, int bd, int subpel_search) {
int i, j;
const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
@@ -1037,7 +1024,7 @@ void aom_highbd_comp_avg_upsampled_pred_c(
}
}
-void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride,
const JNT_COMP_PARAMS *jcp_param) {
@@ -1046,6 +1033,7 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
const int bck_offset = jcp_param->bck_offset;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -1061,17 +1049,18 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
void aom_highbd_jnt_comp_avg_upsampled_pred_c(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
@@ -1104,21 +1093,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
}
}
-void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask) {
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search) {
if (subpel_x_q3 | subpel_y_q3) {
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
ref = comp_pred;
ref_stride = width;
}
- aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
- mask_stride, invert_mask);
+ aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
}
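The per-pixel work behind aom_comp_mask_pred_c above is a 6-bit-alpha blend of the two predictors (invert_mask swaps which one receives the mask weight). A scalar sketch, assuming AOM_BLEND_A64_MAX_ALPHA == 64:

#include <stdint.h>

/* Assumed per-pixel semantics of the masked compound average:
 * blend ref and pred with a 0..64 alpha, rounding to nearest. */
static inline uint8_t comp_mask_pixel_sketch(uint8_t m, uint8_t ref,
                                             uint8_t pred) {
  return (uint8_t)((m * ref + (64 - m) * pred + 32) >> 6);
}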
#define MASK_SUBPIX_VAR(W, H) \
@@ -1164,13 +1155,14 @@ MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
-void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride, const uint8_t *mask,
int mask_stride, int invert_mask) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
if (!invert_mask)
@@ -1187,16 +1179,15 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
void aom_highbd_comp_mask_upsampled_pred(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int bd) {
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
- aom_highbd_comp_mask_pred(comp_pred, pred8, width, height,
- CONVERT_TO_BYTEPTR(comp_pred), width, mask,
- mask_stride, invert_mask);
+ bd, subpel_search);
+ aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+ mask, mask_stride, invert_mask);
}
#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
@@ -1214,7 +1205,7 @@ void aom_highbd_comp_mask_upsampled_pred(
aom_highbd_var_filter_block2d_bil_second_pass( \
fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
- aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
invert_mask); \
\
@@ -1236,7 +1227,7 @@ void aom_highbd_comp_mask_upsampled_pred(
aom_highbd_var_filter_block2d_bil_second_pass( \
fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
- aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
invert_mask); \
\
@@ -1258,7 +1249,7 @@ void aom_highbd_comp_mask_upsampled_pred(
aom_highbd_var_filter_block2d_bil_second_pass( \
fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
- aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
invert_mask); \
\
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
index b954470de..362da29d3 100644
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_VARIANCE_H_
-#define AOM_DSP_VARIANCE_H_
+#ifndef AOM_AOM_DSP_VARIANCE_H_
+#define AOM_AOM_DSP_VARIANCE_H_
#include "config/aom_config.h"
@@ -70,18 +70,12 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-void aom_comp_mask_upsampled_pred(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
-
void aom_highbd_comp_mask_upsampled_pred(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int bd);
+ int bd, int subpel_search);
typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
const int32_t *wsrc,
@@ -133,4 +127,4 @@ uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
} // extern "C"
#endif
-#endif // AOM_DSP_VARIANCE_H_
+#endif // AOM_AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 401fbdc48..5f5bf5f14 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2;
filter8_1dfunction aom_filter_block1d4_v8_sse2;
filter8_1dfunction aom_filter_block1d4_h8_sse2;
+#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
+#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
+#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
+#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
+#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
+
filter8_1dfunction aom_filter_block1d16_v2_sse2;
filter8_1dfunction aom_filter_block1d16_h2_sse2;
filter8_1dfunction aom_filter_block1d8_v2_sse2;
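The #define block added above maps the new 4-tap entry points onto the existing 8-tap SSE2 routines rather than adding new kernels. That is sound because a 4-tap filter stored in an 8-tap coefficient array with zero outer taps yields identical sums; a sketch of the idea (illustrative, not aom code):

#include <stdint.h>

/* Generic 8-tap dot product; with a kernel padded as
 * {0, 0, a, b, c, d, 0, 0} it degenerates to the 4-tap result. */
static int convolve8_sketch(const uint8_t *s, const int16_t *k) {
  int sum = 0;
  for (int t = 0; t < 8; ++t) sum += s[t] * k[t];
  return sum;
}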
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index f3fe50372..94b5da171 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
_mm256_extractf128_si256(*a, 1));
}
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd,
+ // process only 4 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
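A note on the arithmetic in aom_filter_block1d4_h4_avx2 above: the 7-bit filter coefficients are halved up front (_mm_srai_epi16(filtersReg, 1)) so the maddubs accumulators fit in 16 bits, leaving results at a 6-bit scale; adding 32 and shifting right by 6 then rounds to nearest. A scalar model of that final step (illustrative only):

#include <stdint.h>

/* Matches the adds_epi16(+32) / srai(6) / packus sequence above:
 * round a 6-bit fixed-point accumulator to an 8-bit pixel. */
static inline uint8_t round6_clamp_sketch(int32_t acc) {
  const int32_t v = (acc + 32) >> 6;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}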
static void aom_filter_block1d4_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2(
}
}
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd,
+ // process only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
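These kernels repeatedly pack two unaligned rows into one 256-bit register (xx_loadu2_mi128) so each loop iteration filters a pair of rows. A plausible sketch of that helper; the real one lives in aom_dsp/x86/synonyms_avx2.h, and its exact body is an assumption here:

#include <immintrin.h>

/* Load two unaligned 16-byte rows into the low/high lanes of a __m256i. */
static inline __m256i loadu2_128_sketch(const void *hi, const void *lo) {
  const __m128i lo128 = _mm_loadu_si128((const __m128i *)lo);
  const __m128i hi128 = _mm_loadu_si128((const __m128i *)hi);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo128), hi128, 1);
}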
static void aom_filter_block1d8_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2(
}
}
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // read 2 strides of the next 16 bytes
+ // (part of which was already read by the earlier load)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if the number of strides is odd,
+ // process only 16 bytes
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
static void aom_filter_block1d16_h8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
@@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2(
}
}
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads in the same 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows of 8 bytes and keep every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
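The vertical kernels keep a sliding window of rows in registers: each iteration loads only the two newest rows, reuses the already-merged older pairs, and rotates them at the bottom of the loop. Stripped of that choreography, the per-pixel math is a plain 4-tap vertical convolution; a scalar model at the full 7-bit coefficient scale (the SIMD code halves the coefficients and rounds at 6 bits instead, so this is a sketch, not a bit-exact reference):

#include <stddef.h>
#include <stdint.h>

/* k holds the four middle taps (taps 2..5) of the 8-tap kernel. */
static void filter_v4_sketch(const uint8_t *src, ptrdiff_t src_pitch,
                             uint8_t *dst, ptrdiff_t dst_pitch, int w, int h,
                             const int16_t *k) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int acc = src[(y + 0) * src_pitch + x] * k[0] +
                      src[(y + 1) * src_pitch + x] * k[1] +
                      src[(y + 2) * src_pitch + x] * k[2] +
                      src[(y + 3) * src_pitch + x] * k[3];
      const int v = (acc + 64) >> 7; /* round at FILTER_BITS == 7 */
      dst[y * dst_pitch + x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}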
static void aom_filter_block1d8_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2(
}
}
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads in the same 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows of 16 bytes and keep every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
+
static void aom_filter_block1d16_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2(
}
}
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
+ // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads in the same 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows of 8 bytes and keep every two
+ // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits; the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 6bcb4a512..325a21b76 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
+#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
+#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
+
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 000000000..67fb4d32b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+ int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ __m256i res = _mm256_packus_epi16(res0, res0);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
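Each of the w16/w32 kernels in this new file implements the same per-pixel formula: blend the two 16-bit convolve outputs with a 6-bit alpha mask, subtract the compound rounding offset, shift, and clamp through the saturating packs. A scalar model of that pixel path (assumed semantics):

#include <stdint.h>

/* m is the 0..64 mask value; round_offset and shift are the values
 * computed in aom_lowbd_blend_a64_d16_mask_avx2 further down. */
static inline uint8_t d16_blend_pixel_sketch(int32_t s0, int32_t s1, int m,
                                             int32_t round_offset, int shift) {
  const int32_t v = (m * s0 + (64 - m) * s1 - round_offset) >> shift;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}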
+static INLINE void blend_a64_d16_mask_w32_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+ const __m256i *v_maxval, int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s0_1 = yy_loadu_256(src0 + 16);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ const __m256i s1_1 = yy_loadu_256(src1 + 16);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+ _mm256_unpacklo_epi16(*m1, max_minus_m1));
+ __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+ _mm256_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+ __m256i res = _mm256_packus_epi16(res0, res1);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m = xx_loadu_128(mask);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m = yy_loadu_256(mask + j);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+ const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m256i m_i00 = yy_loadu_256(mask);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
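In the subw1/subh1 paths the mask is sampled at twice the prediction resolution in both directions, so each output pixel folds four mask bytes into one: adds_epu8 sums the two rows, maddubs against one_b sums horizontal pairs, and the +2 >> 2 rounds. The scalar equivalent:

#include <stdint.h>

/* Round-to-nearest average of a 2x2 block of mask samples (values <= 64,
 * so the byte additions in the SIMD version cannot saturate). */
static inline uint8_t mask_avg_2x2_sketch(uint8_t m00, uint8_t m01,
                                          uint8_t m10, uint8_t m11) {
  return (uint8_t)((m00 + m01 + m10 + m11 + 2) >> 2);
}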
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+ const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + j);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+
+ const __m256i m_ac =
+ _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+ const __m256i m1 =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ }
+}
+
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+ const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+ const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+ const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+ const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+ return v_res;
+}
+
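blend_16_u8_avx2 above (and blend_32_u8_avx2 below) lean on one trick: interleaving the two source rows byte-wise against the interleaved (m, 64 - m) masks lets a single _mm256_maddubs_epi16 compute the whole m*s0 + (64 - m)*s1 blend per 16-bit lane. One lane of that instruction as a scalar sketch:

#include <stdint.h>

/* One 16-bit lane of maddubs as used here: unsigned source bytes times
 * the mask bytes, summed in adjacent pairs. Masks stay <= 64, so the
 * nominally signed second operand is never negative. */
static inline int16_t maddubs_lane_sketch(uint8_t s0, uint8_t s1, uint8_t m,
                                          uint8_t inv_m) {
  return (int16_t)(s0 * m + s1 * inv_m);
}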
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = yy_loadu_256(src0);
+ const __m256i v_s1_b = yy_loadu_256(src1);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_p1_w =
+ _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+ const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m256i v_ral_b = yy_loadu_256(mask);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+ const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
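
When both subsampling flags are set, the mask is stored at twice the block resolution in each direction, and the loop above reduces each 2x2 group with byte adds plus a rounded shift by 2. Assuming this matches the C reference's ROUND_POWER_OF_TWO(a + b + c + d, 2), a scalar model (hypothetical helper name) is:

    /* 2x2 mask downsample performed by the sx_sy kernels. */
    static uint8_t downsample_mask_2x2(const uint8_t *mask, int mask_stride,
                                       int col) {
      const int sum = mask[2 * col] + mask[2 * col + 1] +
                      mask[mask_stride + 2 * col] +
                      mask[mask_stride + 2 * col + 1];
      return (uint8_t)((sum + 2) >> 2); /* no overflow: each sample <= 64 */
    }
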
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+ const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rvsbh_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+ do {
+ const __m256i v_rl_b = yy_loadu_256(mask);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+ const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
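
For horizontal-only subsampling the pair reduction is just _mm256_avg_epu8, which computes the rounded average (a + b + 1) >> 1 per byte; the _mm256_and_si256 against 0x00FF then keeps the even-indexed results. A one-line scalar equivalent (hypothetical name):

    /* Rounded average of two horizontally adjacent mask samples. */
    static uint8_t downsample_mask_2x1(const uint8_t *mask, int col) {
      return (uint8_t)((mask[2 * col] + mask[2 * col + 1] + 1) >> 1);
    }
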
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+ const __m256i v_ah_b =
+ _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ra_b = yy_loadu_256(mask + c);
+ const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+ const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_m0_b = yy_loadu_256(mask + c);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ do {
+ const __m128i v_m0_b = xx_loadu_128(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ default:
+ blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subx, int suby) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subx, suby);
+ } else {
+ if (subx & suby) {
+ blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subx) {
+ blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (suby) {
+ blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else {
+ blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+ }
+}
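
A minimal usage sketch of the new entry point (example_blend_32x32 is illustrative only): a 32x32 block with a full-resolution mask takes the subx = suby = 0 path and lands in blend_a64_mask_w32n_avx2.

    static void example_blend_32x32(uint8_t *dst, const uint8_t *pred0,
                                    const uint8_t *pred1, const uint8_t *mask) {
      /* All buffers are 32x32 with stride 32; mask values lie in [0, 64]. */
      aom_blend_a64_mask_avx2(dst, 32, pred0, 32, pred1, 32, mask, 32,
                              /*w=*/32, /*h=*/32, /*subx=*/0, /*suby=*/0);
    }
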
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 49c20b467..9d6b4c2f7 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -20,6 +20,7 @@
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
#include "config/aom_dsp_rtcd.h"
@@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
-
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_m0_b = xx_loadl_32(mask);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
@@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
-
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_m0_b = xx_loadl_64(mask);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
@@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_m0l_b = xx_loadl_64(mask + c);
- const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
- const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
- const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+ const __m128i v_m0_b = xx_loadu_128(mask + c);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
}
@@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_r_b = xx_loadl_64(mask);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
@@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_r_b = xx_loadu_128(mask);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
- const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
- const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
-
- const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
- const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
}
@@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
do {
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
@@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
(void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zero = _mm_setzero_si128();
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ra_b = xx_loadu_128(mask + c);
const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
}
@@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
(void)w;
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_32(dst, v_res_b);
@@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
(void)w;
do {
const __m128i v_ra_b = xx_loadu_128(mask);
const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
xx_storel_64(dst, v_res_b);
@@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
const uint8_t *mask, uint32_t mask_stride, int w, int h) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
do {
int c;
for (c = 0; c < w; c += 16) {
@@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
- const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
- const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
-
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
xx_storeu_128(dst + c, v_res_b);
}
@@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
}
}
-static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
- const CONV_BUF_TYPE *src1,
- const __m128i *m,
- const __m128i *v_round_offset,
- const __m128i *v_maxval, int round_bits) {
- const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
- const __m128i s0 = xx_loadl_64(src0);
- const __m128i s1 = xx_loadl_64(src1);
- const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
- const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
- const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
- const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
- const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
- const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
- const __m128i res_e = _mm_packs_epi32(res_d, res_d);
- const __m128i res = _mm_packus_epi16(res_e, res_e);
-
- xx_storel_32(dst, res);
+static INLINE void blend_a64_d16_mask_w16_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+ _mm_unpacklo_epi16(*m0, max_minus_m0));
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);
+
+ _mm_storeu_si128((__m128i *)(dst), res);
+}
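
The d16 path blends 16-bit convolve outputs rather than pixels, so the weighted sum is formed in 32 bits with _mm_madd_epi16 and the convolve rounding offset is removed before the final shift. Per output pixel this is, in scalar form (hypothetical helper; the [0, 255] clamp mirrors the packs/packus pair):

    static uint8_t blend_a64_d16_scalar(int32_t s0, int32_t s1, int32_t m,
                                        int32_t round_offset, int shift) {
      int32_t v =
          (s0 * m + s1 * (AOM_BLEND_A64_MAX_ALPHA - m) - round_offset) >> shift;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
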
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
}
void aom_lowbd_blend_a64_d16_mask_sse4_1(
@@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
ConvolveParams *conv_params) {
const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
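
The rewritten constants fold the old two-step rounding (shift by AOM_BLEND_A64_ROUND_BITS, subtract the offset, round-shift by round_bits) into a single subtract-and-shift, with the +(1 << (round_bits - 1)) rounding term pre-merged into the constant. A worked instance, assuming for illustration round_0 = 3 and round_1 = 7 at bd = 8 (so round_bits = 4):

    /* old: offset_bits = 19, round_offset_old = (1 << 12) + (1 << 11) = 6144
     * new: round_offset = (6144 - (1 << 3)) << 6 = 392704, shift = 4 + 6 = 10
     * i.e. round_offset = (round_offset_old - (1 << (round_bits - 1)))
     *                     << AOM_BLEND_A64_ROUND_BITS                        */
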
@@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1(
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i v_ro_a = xx_loadl_32(&round_offset);
- const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
- const __m128i one_w = _mm_set1_epi16(1);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i two_w = _mm_set1_epi16(2);
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
if (subw == 0 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
- const __m128i m = _mm_cvtepu8_epi16(m0);
-
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
}
+
} else if (subw == 1 && subh == 1) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 =
- xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
- const __m128i m_i1 =
- xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
- const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
- const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
- const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
}
} else if (subw == 1 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
- const __m128i m = _mm_srli_epi16(m_ac_1, 1);
-
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
}
} else {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 4) {
- const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
- const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
- const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
- const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
- const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
- const __m128i m = _mm_srli_epi16(m_ac_1, 1);
-
- blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
- &src1[i * src1_stride + j], &m, &v_round_offset,
- &v_maxval, round_bits);
- }
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
}
}
}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 59506bdfe..064910232 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
@@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+ const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
@@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
for (c = 0; c < w; c += 16) {
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
+ blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 000000000..c071fdcfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w4_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadl_64(src0);
+ const __m128i s1 = xx_loadl_64(src1);
+ const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+ const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+ const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+ const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
+ const __m128i res_d = _mm_srai_epi32(res_c, shift);
+ const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ xx_storel_32(dst, res);
+}
+
+static INLINE void blend_a64_d16_mask_w8_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadu_128(src0);
+ const __m128i s1 = xx_loadu_128(src1);
+ __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+ _mm_unpacklo_epi16(*m, max_minus_m));
+ __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+ _mm_unpackhi_epi16(*m, max_minus_m));
+ res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
+ res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+ const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ _mm_storel_epi64((__m128i *)(dst), res);
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_32(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_64(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
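
In the subw1_subh0 variants above, the horizontal pair sum produced by _mm_maddubs_epi16 is halved with _mm_avg_epu16 against zero, exploiting avg(x, 0) = (x + 1) >> 1 as a rounded divide-by-two. Scalar form (hypothetical name):

    static uint16_t rounded_halve(uint16_t x) {
      return (uint16_t)((x + 1) >> 1);
    }
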
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index 4880438bc..8d9b32510 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -9,42 +9,44 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_DSP_X86_BLEND_SSE4_H_
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
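
The table gathers the even-indexed bytes of each 128-bit lane into its low half and the odd-indexed bytes into its high half, so a following unpacklo/unpackhi (or an average of the two halves) operates on the left and right samples of each horizontally subsampled mask pair. A scalar sketch of one lane's shuffle (hypothetical helper):

    static void shuffle_lane(const uint8_t in[16], uint8_t out[16]) {
      for (int i = 0; i < 8; ++i) {
        out[i] = in[2 * i];         /* even bytes -> low half  */
        out[i + 8] = in[2 * i + 1]; /* odd bytes  -> high half */
      }
    }
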
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
const __m128i v_s0_b = xx_loadl_32(src0);
const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
const __m128i v_s0_b = xx_loadl_64(src0);
const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
@@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
return v_res_w;
}
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadu_128(src0);
+ const __m128i v_s1_b = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
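The three *_u8 kernels above share one trick: _mm_maddubs_epi16 on byte-interleaved sources and masks forms s0 * m0 + s1 * m1 per pixel in 16 bits (at most 255 * 64 * 2 = 32640, so no signed overflow), and _mm_mulhrs_epi16 then performs the rounded shift in one step. A hedged scalar model, assuming the callers pass a *rounding operand of 1 << (15 - AOM_BLEND_A64_ROUND_BITS), i.e. 512 for 6 round bits:

#include <stdint.h>

/* Scalar sketch of blend_4/8/16_u8: with r == 512, the SIMD form
 * (x * r + (1 << 14)) >> 15 equals the (x + 32) >> 6 below. */
static uint8_t blend_px(uint8_t s0, uint8_t s1, uint8_t m0, uint8_t m1) {
  const int sum = s0 * m0 + s1 * m1;  // what _mm_maddubs_epi16 produces
  return (uint8_t)((sum + 32) >> 6);  // the rounding _mm_mulhrs_epi16 models
}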
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w);
@@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
return v_res_w;
}
-#endif // AOM_DSP_X86_BLEND_SSE4_H_
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 3f46420dd..96fe4ebb6 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_COMMON_AVX2_H
-#define AOM_DSP_X86_COMMON_AVX2_H
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
#include <immintrin.h>
@@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
-#endif
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 36fb1963a..3e19682cd 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_CONVOLVE_H_
-#define AOM_DSP_X86_CONVOLVE_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
#include <assert.h>
@@ -17,7 +17,6 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-#include "aom_dsp/aom_convolve.h"
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *output_ptr, ptrdiff_t out_pitch,
@@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
(void)y_step_q4; \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[2]) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
@@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
} \
}
-#endif // AOM_DSP_X86_CONVOLVE_H_
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
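The new first branch in the macro above routes to dedicated 4-tap kernels when only the middle taps are live, before falling back to the existing 8-tap and bilinear paths. A sketch of the classification it encodes (helper name illustrative; degenerate tap layouts ignored):

#include <stdint.h>

/* Classify an 8-entry tap array the way the dispatch above does. */
static int num_taps(const int16_t *filter) {
  if (filter[0] | filter[1] | filter[6] | filter[7]) return 8;
  if (filter[2] | filter[5]) return 4;
  return 2;  // bilinear: only filter[3] and filter[4] are nonzero
}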
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
index 72fabd236..30253f65c 100644
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
-#define AOM_DSP_X86_CONVOLVE_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
// filters for 16
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
@@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
};
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
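Each 4-byte group in filt4_d4_global_avx2 starts one pixel later than the previous one and two pixels into the source window, which matches a filter whose live taps sit at filter[2..5]. A hypothetical scalar counterpart of the horizontal kernel these shuffle offsets feed:

#include <stdint.h>

/* Illustrative 4-tap horizontal filter for one output pixel; the real
 * kernels keep the taps in filter[2..5], hence the offset-2 start. */
static int filter4_px(const uint8_t *src, const int16_t *filter) {
  int sum = 0;
  for (int k = 2; k <= 5; ++k) sum += src[k] * filter[k];
  return sum;  // rounding and clamping happen later in the real kernels
}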
static INLINE void prepare_coeffs_lowbd(
const InterpFilterParams *const filter_params, const int subpel_q4,
__m256i *const coeffs /* [4] */) {
@@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding(
return res_round;
}
-#endif
+#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
index e80c5872f..707bd2d78 100644
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
// Note:
// This header file should be included after any x86 intrinsics header file
@@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
_mm_store_si128((__m128i *)dst, d);
}
-#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
index 399df5d6d..445d04b10 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_
-#define AOM_DSP_X86_CONVOLVE_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
// Note:
// This header file should be included after any x86 intrinsics header file
@@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2(
return res_round;
}
-#endif
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
index d48c25667..6b8388d84 100644
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
-#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
// Note:
// This header file should be included after any x86 intrinsics header file
@@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
return res;
}
-#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 12ccf7f26..260d8dd58 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
#ifdef __cplusplus
extern "C" {
@@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
} // extern "C"
#endif
-#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
deleted file mode 100644
index 99f17ebdf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ /dev/null
@@ -1,351 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(aom_half_horiz_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7,           xmm7                    ;   sse accumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- pxor xmm0, xmm0 ;
-
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
- lea rsi, [rsi + rax]
-
-aom_half_horiz_vert_variance16x_h_1:
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
-
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
-
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz aom_half_horiz_vert_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void aom_half_vert_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride,
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_vert_variance16x_h_sse2) PRIVATE
-sym(aom_half_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7,           xmm7                    ;   sse accumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
-
-aom_half_vert_variance16x_h_1:
- movdqu xmm3, XMMWORD PTR [rsi]
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
-
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm3
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1
- jnz aom_half_vert_variance16x_h_1
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref,
-; int ref_stride
-; unsigned char *src,
-; int src_stride,
-; unsigned int height,
-; int *sum,
-; unsigned int *sumsquared)
-global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE
-sym(aom_half_horiz_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7,           xmm7                    ;   sse accumulator
- mov rsi, arg(0) ;ref
-
- mov rdi, arg(2) ;src
- movsxd rcx, dword ptr arg(4) ;height
- movsxd rax, dword ptr arg(1) ;ref_stride
- movsxd rdx, dword ptr arg(3) ;src_stride
-
- pxor xmm0, xmm0 ;
-
-aom_half_horiz_variance16x_h_1:
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz aom_half_horiz_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-aom_bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
deleted file mode 100644
index 2a018c1cf..000000000
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
- int ref_stride,
- const unsigned char *src,
- int src_stride, unsigned int height,
- int *sum, unsigned int *sumsquared);
-void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
- const unsigned char *src, int src_stride,
- unsigned int height, int *sum,
- unsigned int *sumsquared);
-
-uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
- aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0,
- &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
- int src_stride,
- const unsigned char *dst,
- int dst_stride, uint32_t *sse) {
- int xsum0;
- unsigned int xxsum0;
-
- aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- assert(xsum0 <= 255 * 16 * 16);
- assert(xsum0 >= -255 * 16 * 16);
- return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 83e0098ba..097e0778f 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
const unsigned char *lt, const unsigned char *thr, int bd) {
int i;
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit, limit, thresh;
__m128i t80;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
@@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat2 = _mm_unpacklo_epi64(flat2, flat2);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3], flat_pq[3];
- __m128i flat2_p[6], flat2_q[6];
- __m128i flat2_pq[6];
- {
- __m128i work0;
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations
+  // below are needed.
+  // With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
__m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
__m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
sum_p = _mm_add_epi16(sum_p, sum_lp);
@@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
__m128i sum_lq = _mm_srli_si128(sum_lp, 8);
__m128i sum_q = _mm_srli_si128(sum_p, 8);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
- flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
- flat2_q[0] =
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
-
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
- __m128i sum_p6, sum_p3;
sum_p6 = _mm_add_epi16(pq[6], pq[6]);
sum_p3 = _mm_add_epi16(pq[3], pq[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- flat2_p[1] = _mm_add_epi16(sum_p, work0);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
- sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
work0 = _mm_add_epi16(sum_p3, pq[1]);
@@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
- flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
work0 = _mm_add_epi16(sum_p3, pq[2]);
@@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- }
-
- // highbd_filter8
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
- for (i = 0; i < 3; i++) {
- pq[i] = _mm_andnot_si128(flat, pq[i]);
- flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
- }
-
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- pq[i] = _mm_andnot_si128(flat2, pq[i]);
- flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
- // get values for when (flat2 && flat && mask)
- pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
}
}
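The restructuring in this hunk, and in the ones that follow, gates the flat and wide-flat arithmetic behind one cheap test: _mm_cmpeq_epi16 against zero yields all-ones lanes exactly where flat is zero, so a movemask of 0xffff means every lane is zero and the whole block can be skipped. The idiom as a standalone sketch:

#include <emmintrin.h>

/* Nonzero iff any 16-bit lane of v is nonzero (SSE2; with SSE4.1 one
 * could instead use !_mm_test_all_zeros(v, v)). */
static int any_lane_nonzero(__m128i v) {
  const __m128i zero = _mm_setzero_si128();
  return 0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(v, zero));
}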
@@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
const uint8_t *thr1, int bd) {
__m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
+
get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
&t80);
__m128i mask;
@@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i ps[2], qs[2];
highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
// flat and wide flat calculations
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
- {
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations
+  // below are needed.
+  // With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
__m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
__m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
__m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
sum_q = _mm_add_epi16(sum_q, sum_lq);
- sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat2_p[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0]))),
- 4);
- flat2_q[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1]))),
- 4);
flat_p[0] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
flat_q[0] =
@@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
__m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
__m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
__m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
- sum_q = _mm_sub_epi16(sum_p, p[5]);
- sum_p = _mm_sub_epi16(sum_p, q[5]);
- flat2_p[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
- 4);
- flat2_q[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
- 4);
+
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
sum_lq = _mm_sub_epi16(sum_lp, p[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
flat_p[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
flat_q[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
- 4);
- flat2_q[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
- 4);
+
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
flat_p[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
flat_q[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
- 4);
- flat2_q[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
- 4);
- flat2_q[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p, _mm_add_epi16(
- sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
- 4);
- flat2_q[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q, _mm_add_epi16(
- sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
- 4);
- }
- // highbd_filter8
- p[2] = _mm_andnot_si128(flat, p[2]);
- // p2 remains unchanged if !(flat && mask)
- flat_p[2] = _mm_and_si128(flat, flat_p[2]);
- // when (flat && mask)
- p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
- q[2] = _mm_andnot_si128(flat, q[2]);
- flat_q[2] = _mm_and_si128(flat, flat_q[2]);
- q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
- int i;
- for (i = 1; i >= 0; i--) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- // highbd_filter16
- for (i = 5; i >= 0; i--) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- p[i] = _mm_andnot_si128(flat2, p[i]);
- flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
- // get values for when (flat2 && flat && mask)
- p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
- q[i] = _mm_andnot_si128(flat2, q[i]);
- flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
- q[i] = _mm_or_si128(q[i], flat2_q[i]);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
}
}
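Both the flat path and the early-out path above end in the same masked select, spelled andnot/and/or because SSE2 has no variable byte blend. The recurring pattern, as a self-contained helper sketch:

#include <emmintrin.h>

/* Per bit: take b where mask is set, a elsewhere. */
static __m128i select_epi16(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}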
@@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
// flat_mask
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
@@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+  // 5-tap filter; needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
// op1
- workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
- _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
- workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
// op0
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
- workp_a =
- _mm_add_epi16(workp_a,
- workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-
- flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
// oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
- *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
- workp_b = _mm_add_epi16(*q1, *q2);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+    // the shift by 3 is deferred until the oq0/oq1 halves are merged below
// oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
- *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
workp_b = _mm_add_epi16(*q2, *q2);
- workp_shft1 = _mm_add_epi16(
- workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
- flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
- }
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
}
static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
@@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
// flat_mask
flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
flat = _mm_max_epi16(flat, work);
@@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+  // 5-tap filter; needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
// op1
@@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
workp_shft1 = _mm_add_epi16(
workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
oq1 = _mm_srli_epi16(workp_shft1, 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
}
void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
@@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
__m128i mask, hev, flat;
__m128i pq[4];
__m128i p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
pq[0] = _mm_unpacklo_epi64(*p0, *q0);
pq[1] = _mm_unpacklo_epi64(*p1, *q1);
@@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
&thresh, &hev, &mask);
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
// flat_mask4
flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
flat = _mm_max_epi16(abs_p1p0, flat);
@@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// replicate for the further "merged variables" usage
flat = _mm_unpacklo_epi64(flat, flat);
- {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
// o*p2
workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
// o*p1
workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
@@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
// oq2
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
-
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
}
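The w8 path above now computes the filtered p2 and q2 in the two 64-bit halves of a single register (opq2), so one flat-select covers both and *q2 is recovered with a byte shift. An illustrative model with hypothetical names:

#include <emmintrin.h>

/* Pack two 4x16-bit results into one register so a single mask/blend
 * covers both, then split them again, as done for p2/q2 above. */
static void pack_and_split(__m128i p2_res, __m128i q2_res, __m128i *p2,
                           __m128i *q2) {
  const __m128i opq2 = _mm_unpacklo_epi64(p2_res, q2_res);  // [p2 | q2]
  *p2 = opq2;
  *q2 = _mm_srli_si128(opq2, 8);  // move the q2 half into the low 64 bits
}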
static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
@@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
mask = _mm_subs_epu16(mask, limit0);
mask = _mm_cmpeq_epi16(mask, zero);
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
flat = _mm_max_epi16(work1, flat);
work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
flat = _mm_max_epi16(work0, flat);
flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
- {
+  // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
__m128i workp_a, workp_b;
// Added before shift for rounding part of ROUND_POWER_OF_TWO
@@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- }
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
}
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
-
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
-
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
}
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index dea113a29..b9689202a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c,
}
void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
(void)scan;
const unsigned int step = 8;
- if (LIKELY(!skip_block)) {
- __m256i qp[5], coeff;
- init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ __m256i qp[5], coeff;
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
- __m256i eob = _mm256_setzero_si256();
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
coeff_ptr += step;
@@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr += step;
iscan += step;
n_coeffs -= step;
-
- update_qp(qp);
-
- while (n_coeffs > 0) {
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
- quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
- } else {
- do {
- const __m256i zero = _mm256_setzero_si256();
- _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
- _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- n_coeffs -= step;
- } while (n_coeffs > 0);
- *eob_ptr = 0;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
}
}
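The now-unconditional tail above reduces the per-lane eob maxima to a scalar: a dword shuffle and two shufflelo folds leave each 128-bit half's maximum in its word 0, and a final _mm_max_epi16 across halves feeds _mm_extract_epi16. The same reduction as a standalone sketch:

#include <immintrin.h>
#include <stdint.h>

/* Horizontal max of sixteen 16-bit lanes, mirroring the fold above. */
static int16_t hmax_epi16(__m256i v) {
  v = _mm256_max_epi16(v, _mm256_shuffle_epi32(v, 0xe));
  v = _mm256_max_epi16(v, _mm256_shufflelo_epi16(v, 0xe));
  v = _mm256_max_epi16(v, _mm256_shufflelo_epi16(v, 1));
  const __m128i m = _mm_max_epi16(_mm256_castsi256_si128(v),
                                  _mm256_extractf128_si256(v, 1));
  return (int16_t)_mm_extract_epi16(m, 0);
}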
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 5570ca5b7..58e5f98e5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -16,7 +16,7 @@
#include "aom_ports/mem.h"
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- int skip_block, const int16_t *zbin_ptr,
+ const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
@@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = ((int)count / 4) - 1; i >= 0; i--) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (test == 0xffff)
- non_zero_regs--;
- else
- break;
- }
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
- // Quantization pass:
- for (i = 0; i < non_zero_regs; i++) {
- __m128i coeffs, coeffs_sign, tmp1, tmp2;
- int test;
- int abs_coeff[4];
- int coeff_sign[4];
-
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- coeffs_sign = _mm_srai_epi32(coeffs, 31);
- coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
- tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
- tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
- tmp1 = _mm_or_si128(tmp1, tmp2);
- test = _mm_movemask_epi8(tmp1);
- _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
- _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
- for (j = 0; j < 4; j++) {
- if (test & (1 << (4 * j))) {
- int k = 4 * i + j;
- const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
- const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
- dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
- }
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
@@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
}
void aom_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2(
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
- if (!skip_block) {
- // Pre-scan pass
- for (i = 0; i < n_coeffs / 4; i++) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (!(test & 0xf)) idx_arr[idx++] = i * 4;
- if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
- if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
- if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
- }
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = idx_arr[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
- }
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
*eob_ptr = eob + 1;
}
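
Note: both functions in this file keep the usual two-pass quantize_b structure: a pre-scan collects the indices whose coefficient falls outside the zero-bin, and the quantization pass touches only those (everything else was already cleared by the memsets). A scalar sketch of the pre-scan test (hypothetical names; the 32x32 variant additionally halves zbin with rounding):

    #include <stdint.h>

    /* Collect indices whose coefficient can quantize to a nonzero value.
       Element 0 of zbin applies to the DC coefficient, element 1 to all AC. */
    static int prescan(const int32_t *coeff, intptr_t n_coeffs,
                       const int16_t zbin[2], int *idx_arr) {
      int idx = 0;
      for (intptr_t i = 0; i < n_coeffs; ++i) {
        if (coeff[i] >= zbin[i != 0] || coeff[i] <= -zbin[i != 0])
          idx_arr[idx++] = (int)i;
      }
      return idx;
    }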
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ for (int i = 0; i < 8; i += 2) {
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
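+  // Widen the 16-bit diff sums to 32 bits, interleave them with the 32-bit
+  // sse accumulator, and reduce across lanes: lane 0 of v_d ends up holding
+  // the sum and lane 1 the sse.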
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
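+  // madd against all-ones pairwise-widens the 16-bit diff sums to 32 bits;
+  // the interleave/add ladder below then reduces sum and sse across lanes,
+  // as in the 8x8 kernel.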
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
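+  // 10-bit samples span 4x the 8-bit range: the per-block sum of differences
+  // carries 2 extra bits and the sum of squared differences 4, so round them
+  // away to report a variance in the 8-bit domain.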
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
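
Note: each VAR_FN(w, h, block_size, shift) expansion evaluates the standard identity var = sse - sum^2 / (w * h), so shift must equal log2(w * h) (e.g. 64 * 64 = 4096 = 2^12, hence VAR_FN(64, 64, 16, 12), and 128 * 128 = 2^14). A self-contained scalar sketch, ignoring the 10-bit renormalization above (hypothetical name):

    #include <stdint.h>

    /* Variance of a w x h block of 16-bit samples; shift == log2(w * h). */
    static uint32_t variance_ref(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride, int w,
                                 int h, int shift, uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
          sum += diff;
          sse64 += (uint64_t)(diff * diff);
        }
      }
      *sse = (uint32_t)sse64;
      const int64_t var = (int64_t)*sse - ((sum * sum) >> shift);
      return var >= 0 ? (uint32_t)var : 0;
    }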
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 131c16aa9..47b052abc 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -179,6 +179,9 @@ HIGH_GET_VAR(8);
return (var >= 0) ? (uint32_t)var : 0; \
}
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
@@ -590,10 +593,10 @@ FNS(sse2);
void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
const struct AV1Common *const cm,
int mi_row, int mi_col, const MV *const mv,
- uint16_t *comp_pred, int width, int height,
+ uint8_t *comp_pred8, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref8, int ref_stride,
- int bd) {
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
if (is_scaled) {
// Note: This is mostly a copy from the >=8X8 case in
// build_inter_predictors() function, with some small tweaks.
- uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
-
// Some assumptions.
const int plane = 0;
@@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
@@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
}
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
if (!subpel_x_q3 && !subpel_y_q3) {
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
if (width >= 8) {
int i;
assert(!(width & 7));
@@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
} else if (!subpel_y_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, kernel, 16, NULL, -1, width, height, bd);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
} else if (!subpel_x_q3) {
const int16_t *const kernel =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
- width, NULL, -1, kernel, 16, width, height, bd);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
} else {
DECLARE_ALIGNED(16, uint16_t,
temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
@@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
intermediate_height, bd);
aom_highbd_convolve8_vert(
CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
- 16, width, height, bd);
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
}
}
void aom_highbd_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd) {
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- int n;
- int i;
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
/*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
assert(!(width * height & 7));
- n = width * height >> 3;
- for (i = 0; i < n; i++) {
- __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
+ int n = width * height >> 3;
+ for (int i = 0; i < n; i++) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
__m128i p0 = _mm_loadu_si128((const __m128i *)pred);
- _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0));
- comp_pred += 8;
+ _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+ comp_pred16 += 8;
pred += 8;
}
}
@@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
xx_storeu_128(result, shift);
}
-void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
const uint8_t *pred8, int width,
int height, const uint8_t *ref8,
int ref_stride,
@@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
_mm_set_epi16(round, round, round, round, round, round, round, round);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
if (width >= 8) {
// Read 8 pixels one row at a time
@@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search) {
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
int n;
int i;
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd);
+ bd, subpel_search);
assert(!(width * height & 7));
n = width * height >> 3;
@@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
const __m128i r =
_mm_set_epi16(round, round, round, round, round, round, round, round);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
for (i = 0; i < n; i++) {
- __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p0 = xx_loadu_128(comp_pred16);
__m128i p1 = xx_loadu_128(pred);
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+ highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
- comp_pred += 8;
+ comp_pred16 += 8;
pred += 8;
}
}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index 6c247a91b..df5449a9d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
sse);
@@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
@@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
- aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
- 4);
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
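
Note: the signature changes in this file all revolve around libaom's pointer-tagging convention for high-bitdepth buffers: a uint16_t buffer travels through uint8_t * interfaces via CONVERT_TO_BYTEPTR and is recovered with CONVERT_TO_SHORTPTR. Roughly (a sketch of the idea from memory; the real macros live in aom_dsp_common.h):

    /* High-bitdepth pointers are tagged by halving the address, letting a
       plain uint8_t * carry a uint16_t buffer through depth-agnostic APIs. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))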
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
index eaf1f347b..f9a41a210 100644
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
/*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
assert(!(width * height & 15));
n = width * height >> 4;
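
Note: the "jnt" (joint, i.e. distance-weighted) compound average that these SIMD loops implement is, in scalar form, a weighted blend whose weights are assumed here to sum to 1 << DIST_PRECISION_BITS (16 in AV1); a sketch with hypothetical names:

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* Blend two predictors with distance weights w0 + w1 == 16. */
    static void jnt_comp_avg_ref(uint8_t *out, const uint8_t *p0,
                                 const uint8_t *p1, int n, int w0, int w1) {
      for (int i = 0; i < n; ++i)
        out[i] = (uint8_t)ROUND_POWER_OF_TWO(p0[i] * w0 + p1[i] * w1,
                                             DIST_PRECISION_BITS);
    }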
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
deleted file mode 100644
index 18862dd3e..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ /dev/null
@@ -1,916 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h> /* AVX2 */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
- __m128i abs_p1p0;
-
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
- q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
- p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
- p0q0 = _mm_shuffle_epi32(q0p0, 78);
-
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
- abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
- abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i t1 = _mm_set1_epi16(0x1);
- __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
- __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
- __m128i qs0 = _mm_xor_si128(p0q0, t80);
- __m128i qs1 = _mm_xor_si128(p1q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
- filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, qs0ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (aom_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- filter1 = _mm_unpacklo_epi8(zero, filter1);
- filter1 = _mm_srai_epi16(filter1, 0xB);
- filter2 = _mm_unpacklo_epi8(zero, filter2);
- filter2 = _mm_srai_epi16(filter2, 0xB);
-
- /* Filter1 >> 3 */
- filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
- qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi16(filter1, t1);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
- filt);
- filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
- qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- flat = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
- _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
-
- q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
-
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
- _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
-
- q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
-
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
- _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
- p7_16 = _mm_unpacklo_epi8(q7p7, zero);
- p6_16 = _mm_unpacklo_epi8(q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(q6p6, zero);
- q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
- pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
- _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
- _mm_add_epi16(q4_16, q3_16));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p =
- _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(
- four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(p7_16, p7_16);
- sum_q7 = _mm_add_epi16(q7_16, q7_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
- sum_p7 = _mm_add_epi16(sum_p7, p7_16);
- sum_q7 = _mm_add_epi16(sum_q7, q7_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
- flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
-
- q2p2 = _mm_andnot_si128(flat, q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
- q6p6 = _mm_andnot_si128(flat2, q6p6);
- flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
- q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
-
- q5p5 = _mm_andnot_si128(flat2, q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
-
- q4p4 = _mm_andnot_si128(flat2, q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
-
- q3p3 = _mm_andnot_si128(flat2, q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
-
- q2p2 = _mm_andnot_si128(flat2, q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
-
- q1p1 = _mm_andnot_si128(flat2, q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
-
- q0p0 = _mm_andnot_si128(flat2, q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
- }
-}
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
- 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
- 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
-};
-
-void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
- p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
-
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
- p256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
- p256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
- p256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
- p256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
- p256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
- q256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
- q256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
- q256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
- q256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
- q256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
-
- p4 = _mm256_castsi256_si128(p256_4);
- p3 = _mm256_castsi256_si128(p256_3);
- p2 = _mm256_castsi256_si128(p256_2);
- p1 = _mm256_castsi256_si128(p256_1);
- p0 = _mm256_castsi256_si128(p256_0);
- q0 = _mm256_castsi256_si128(q256_0);
- q1 = _mm256_castsi256_si128(q256_1);
- q2 = _mm256_castsi256_si128(q256_2);
- q3 = _mm256_castsi256_si128(q256_3);
- q4 = _mm256_castsi256_si128(q256_4);
-
- {
- const __m128i abs_p1p0 =
- _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 =
- _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 =
- _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 =
- _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
- __m128i work;
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
- __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
- flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
- flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
-
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- /* (aom_filter + 3 * (qs0 - ps0)) & mask */
- filt = _mm_and_si128(filt, mask);
-
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-
- /* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- // loopfilter done
-
- {
- __m128i work;
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- p256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
- q256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
- p5 = _mm256_castsi256_si128(p256_5);
- q5 = _mm256_castsi256_si128(q256_5);
- flat2 = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
-
- flat2 = _mm_max_epu8(work, flat2);
- p256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
- q256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
- p6 = _mm256_castsi256_si128(p256_6);
- q6 = _mm256_castsi256_si128(q256_6);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
-
- flat2 = _mm_max_epu8(work, flat2);
-
- p256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
- q256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
- p7 = _mm256_castsi256_si128(p256_7);
- q7 = _mm256_castsi256_si128(q256_7);
- work = _mm_max_epu8(
- _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
-
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
- const __m256i eight = _mm256_set1_epi16(8);
- const __m256i four = _mm256_set1_epi16(4);
- __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
- pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
- const __m256i filter =
- _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
- p256_7 = _mm256_shuffle_epi8(p256_7, filter);
- p256_6 = _mm256_shuffle_epi8(p256_6, filter);
- p256_5 = _mm256_shuffle_epi8(p256_5, filter);
- p256_4 = _mm256_shuffle_epi8(p256_4, filter);
- p256_3 = _mm256_shuffle_epi8(p256_3, filter);
- p256_2 = _mm256_shuffle_epi8(p256_2, filter);
- p256_1 = _mm256_shuffle_epi8(p256_1, filter);
- p256_0 = _mm256_shuffle_epi8(p256_0, filter);
- q256_0 = _mm256_shuffle_epi8(q256_0, filter);
- q256_1 = _mm256_shuffle_epi8(q256_1, filter);
- q256_2 = _mm256_shuffle_epi8(q256_2, filter);
- q256_3 = _mm256_shuffle_epi8(q256_3, filter);
- q256_4 = _mm256_shuffle_epi8(q256_4, filter);
- q256_5 = _mm256_shuffle_epi8(q256_5, filter);
- q256_6 = _mm256_shuffle_epi8(q256_6, filter);
- q256_7 = _mm256_shuffle_epi8(q256_7, filter);
-
- pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
- _mm256_add_epi16(p256_4, p256_3));
- pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
- _mm256_add_epi16(q256_4, q256_3));
-
- pixetFilter_p2p1p0 =
- _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
- pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 =
- _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
- pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-
- pixelFilter_p = _mm256_add_epi16(
- eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
-
- pixetFilter_p2p1p0 = _mm256_add_epi16(
- four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
-
- flat2_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
-
- flat2_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(p256_3, p256_0)),
- 3);
-
- flat_p0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(q256_3, q256_0)),
- 3);
-
- flat_q0 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(p256_7, p256_7);
-
- sum_q7 = _mm256_add_epi16(q256_7, q256_7);
-
- sum_p3 = _mm256_add_epi16(p256_3, p256_3);
-
- sum_q3 = _mm256_add_epi16(q256_3, q256_3);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
-
- flat2_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
-
- flat2_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
-
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
-
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_1)),
- 3);
-
- flat_p1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_1)),
- 3);
-
- flat_q1 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
-
- sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
-
- flat2_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
-
- flat2_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
-
- pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
-
- res_p =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
- _mm256_add_epi16(sum_p3, p256_2)),
- 3);
-
- flat_p2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q =
- _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
- _mm256_add_epi16(sum_q3, q256_2)),
- 3);
-
- flat_q2 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
-
- flat2_p3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
-
- flat2_q3 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
-
- flat2_p4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
-
- flat2_q4 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
-
- flat2_p5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
-
- flat2_q5 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
-
- sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
-
- sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
-
- pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
-
- pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
-
- res_p = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
-
- flat2_p6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
-
- res_q = _mm256_srli_epi16(
- _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
-
- flat2_q6 = _mm256_castsi256_si128(
- _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
- }
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- p2 = _mm_andnot_si128(flat, p2);
- flat_p2 = _mm_and_si128(flat, flat_p2);
- p2 = _mm_or_si128(flat_p2, p2);
-
- p1 = _mm_andnot_si128(flat, ps1);
- flat_p1 = _mm_and_si128(flat, flat_p1);
- p1 = _mm_or_si128(flat_p1, p1);
-
- p0 = _mm_andnot_si128(flat, ps0);
- flat_p0 = _mm_and_si128(flat, flat_p0);
- p0 = _mm_or_si128(flat_p0, p0);
-
- q0 = _mm_andnot_si128(flat, qs0);
- flat_q0 = _mm_and_si128(flat, flat_q0);
- q0 = _mm_or_si128(flat_q0, q0);
-
- q1 = _mm_andnot_si128(flat, qs1);
- flat_q1 = _mm_and_si128(flat, flat_q1);
- q1 = _mm_or_si128(flat_q1, q1);
-
- q2 = _mm_andnot_si128(flat, q2);
- flat_q2 = _mm_and_si128(flat, flat_q2);
- q2 = _mm_or_si128(flat_q2, q2);
-
- p6 = _mm_andnot_si128(flat2, p6);
- flat2_p6 = _mm_and_si128(flat2, flat2_p6);
- p6 = _mm_or_si128(flat2_p6, p6);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
-
- p5 = _mm_andnot_si128(flat2, p5);
- flat2_p5 = _mm_and_si128(flat2, flat2_p5);
- p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
-
- p4 = _mm_andnot_si128(flat2, p4);
- flat2_p4 = _mm_and_si128(flat2, flat2_p4);
- p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
-
- p3 = _mm_andnot_si128(flat2, p3);
- flat2_p3 = _mm_and_si128(flat2, flat2_p3);
- p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
-
- p2 = _mm_andnot_si128(flat2, p2);
- flat2_p2 = _mm_and_si128(flat2, flat2_p2);
- p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-
- p1 = _mm_andnot_si128(flat2, p1);
- flat2_p1 = _mm_and_si128(flat2, flat2_p1);
- p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-
- p0 = _mm_andnot_si128(flat2, p0);
- flat2_p0 = _mm_and_si128(flat2, flat2_p0);
- p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-
- q0 = _mm_andnot_si128(flat2, q0);
- flat2_q0 = _mm_and_si128(flat2, flat2_q0);
- q0 = _mm_or_si128(flat2_q0, q0);
- _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
-
- q1 = _mm_andnot_si128(flat2, q1);
- flat2_q1 = _mm_and_si128(flat2, flat2_q1);
- q1 = _mm_or_si128(flat2_q1, q1);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-
- q2 = _mm_andnot_si128(flat2, q2);
- flat2_q2 = _mm_and_si128(flat2, flat2_q2);
- q2 = _mm_or_si128(flat2_q2, q2);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-
- q3 = _mm_andnot_si128(flat2, q3);
- flat2_q3 = _mm_and_si128(flat2, flat2_q3);
- q3 = _mm_or_si128(flat2_q3, q3);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
-
- q4 = _mm_andnot_si128(flat2, q4);
- flat2_q4 = _mm_and_si128(flat2, flat2_q4);
- q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
-
- q5 = _mm_andnot_si128(flat2, q5);
- flat2_q5 = _mm_and_si128(flat2, flat2_q5);
- q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
-
- q6 = _mm_andnot_si128(flat2, q6);
- flat2_q6 = _mm_and_si128(flat2, flat2_q6);
- q6 = _mm_or_si128(flat2_q6, q6);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
- }
- _mm256_zeroupper();
-}
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index f1eac233b..9d88b5e49 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2(
*d7 = _mm_unpackhi_epi64(w7, w15);
}
+// This function treats its input as 2 parallel 8x4 matrices, transposing each
+// of them independently while flipping the second matrix horizontally. Used
+// for the inverse of the 14-tap filter's pq pairs.
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,
+ __m128i *pq2, __m128i *pq3) {
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
+}
+
static INLINE void transpose8x16_16x8_sse2(
__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
__m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
@@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2(
*d14d15 = _mm_unpackhi_epi64(w7, w15);
}
+// This function treats its input as 2 parallel 8x4 matrices, transposing each
+// of them to 4x8 independently while flipping the second matrix horizontally.
+// Used to create the 14-tap filter's pq pairs.
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+ ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+ ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+}
+
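
The lane gymnastics above lean entirely on _mm_slli_si128/_mm_srli_si128, which shift by whole bytes, not bits. A quick standalone check (illustrative only):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* bytes 0..15 in order; _mm_set_epi8 takes arguments high-to-low */
      __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                               7, 6, 5, 4, 3, 2, 1, 0);
      uint8_t out[16];
      /* shift right by 12 BYTES: old bytes 12..15 land in 0..3, rest zero */
      _mm_storeu_si128((__m128i *)out, _mm_srli_si128(v, 12));
      for (int i = 0; i < 16; ++i) printf("%u ", out[i]);
      printf("\n"); /* 12 13 14 15 0 0 ... 0 */
      return 0;
    }
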
static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
__m128i *hev, __m128i *mask,
__m128i *qs1qs0, __m128i *ps1ps0) {
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter2filter1 =
+ _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+ hev1 = _mm_srli_si128(filter2filter1, 8);
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
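
For reference, here is a scalar model of the equations filter4_sse2 vectorizes, matching the comments quoted in its body (a sketch assuming the standard AV1 filter4 math; the helper names are ours). Note also the trick used above: duplicating each byte into a 16-bit lane with _mm_unpacklo_epi8(x, x) and then _mm_srai_epi16 by 11 (= 8 + 3) produces a sign-correct per-byte >> 3, for which SSE2 has no direct instruction.

    #include <stdint.h>

    static int8_t signed_char_clamp(int t) {
      return (int8_t)(t < -128 ? -128 : t > 127 ? 127 : t);
    }

    /* Inputs are the 0x80-offset signed pixels; hev/mask are 0 or -1. */
    void filter4_scalar(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                        int8_t hev, int8_t mask) {
      int8_t filter = (int8_t)(signed_char_clamp(*ps1 - *qs1) & hev);
      filter = (int8_t)(signed_char_clamp(filter + 3 * (*qs0 - *ps0)) & mask);
      const int8_t filter1 = (int8_t)(signed_char_clamp(filter + 4) >> 3);
      const int8_t filter2 = (int8_t)(signed_char_clamp(filter + 3) >> 3);
      *qs0 = signed_char_clamp(*qs0 - filter1);
      *ps0 = signed_char_clamp(*ps0 + filter2);
      filter = (int8_t)(((filter1 + 1) >> 1) & ~hev); /* round, drop if hev */
      *qs1 = signed_char_clamp(*qs1 - filter);
      *ps1 = signed_char_clamp(*ps1 + filter);
    }
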
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0) {
const __m128i t3t4 =
_mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
const __m128i t80 = _mm_set1_epi8(0x80);
@@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
__m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
__m128i q1p1, q0p0, p1p0, q1q0;
__m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ /* (abs(q1 - q0), abs(p1 - p0)) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
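
The masks above use another SSE2 idiom: with no unsigned byte compare available, x <= t is computed as "saturating x - t equals zero" (the subs_epu8/cmpeq pair on mask), and x > t is its inversion, as in the hev masks of the other kernels. A standalone sketch (illustrative only; names are ours):

    #include <emmintrin.h>
    #include <stdio.h>

    /* x <= t per byte: saturating x - t is zero */
    static __m128i cmple_epu8(__m128i x, __m128i t) {
      return _mm_cmpeq_epi8(_mm_subs_epu8(x, t), _mm_setzero_si128());
    }

    /* x > t per byte: the inversion, as used for the hev mask */
    __m128i cmpgt_epu8(__m128i x, __m128i t) {
      const __m128i ff = _mm_cmpeq_epi8(x, x); /* all ones */
      return _mm_xor_si128(cmple_epu8(x, t), ff);
    }

    int main(void) {
      unsigned char out[16];
      _mm_storeu_si128((__m128i *)out,
                       cmpgt_epu8(_mm_set1_epi8(5), _mm_set1_epi8(3)));
      printf("%u\n", out[0]); /* 255, since 5 > 3 */
      return 0;
    }
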
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
__m128i mask, hev;
const __m128i zero = _mm_setzero_si128();
@@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2(
mask = _mm_cmpeq_epi8(mask, zero);
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
}
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
__m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
xx_storel_32(s - 1 * p, ps1ps0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
xx_storel_32(s + 0 * p, qs1qs0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
+ xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
}
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
@@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
__m128i p1, p0, q0, q1;
const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
__m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
@@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
// Transpose 8x4 to 4x8
- p1 = _mm_srli_si128(p1p0, 8);
- q1 = _mm_srli_si128(q1q0, 8);
+ p1 = _mm_srli_si128(p1p0, 4);
+ q1 = _mm_srli_si128(q1q0, 4);
transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
@@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
xx_storel_32(s - (num + 1) * p, x);
- xx_storel_32(s + num * p, _mm_srli_si128(x, 8));
+ xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
}
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
__m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
__m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
__m128i *thresh) {
@@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
}
// lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
// loopfilter done
__m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
__m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
- {
- __m128i work;
- flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
- work = abs_diff(*q6p6, *q0p0);
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- {
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ // if flat == 0 then flat2 is zero as well and we don't need any of the
+ // calculations below
+ // with SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
__m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
@@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2(
_mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
flat_q1p1 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
- 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
res_p = _mm_srli_epi16(
_mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
res_q = _mm_srli_epi16(
_mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
flat_q2p2 = _mm_packus_epi16(res_p, res_q);
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
- 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
- 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
- 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
}
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+}
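
New in this revision: both 14-tap paths skip the flat and wide-flat arithmetic entirely when the corresponding mask is all zero. The all-zero test is the SSE2 stand-in for SSE4.1's _mm_test_all_zeros that the comments mention; shown standalone below (illustrative only, not part of the patch):

    #include <emmintrin.h>
    #include <stdio.h>

    /* 0xffff from movemask means every byte compared equal to zero,
       i.e. the mask is entirely zero and the wide filter can be skipped */
    int mask_is_all_zero(__m128i m) {
      const __m128i zero = _mm_setzero_si128();
      return 0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(m, zero));
    }

    int main(void) {
      printf("%d\n", mask_is_all_zero(_mm_setzero_si128())); /* 1 */
      printf("%d\n", mask_is_all_zero(_mm_set1_epi8(-1)));   /* 0 */
      return 0;
    }
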
- flat = _mm_shuffle_epi32(flat, 68);
- flat2 = _mm_shuffle_epi32(flat2, 68);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
- *q2p2 = _mm_andnot_si128(flat, *q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- *q5p5 = _mm_andnot_si128(flat2, *q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
- *q4p4 = _mm_andnot_si128(flat2, *q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
- *q3p3 = _mm_andnot_si128(flat2, *q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
- *q2p2 = _mm_andnot_si128(flat2, *q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
- *q1p1 = _mm_andnot_si128(flat2, *q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // if flat == 0 then flat2 is zero as well and we don't need any of the
+ // calculations below
+ // with SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
- *q0p0 = _mm_andnot_si128(flat2, *q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
}
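
The "apply flat" blocks in both 14-tap kernels blend filtered and original pixels with the classic SSE2 select, since no byte-blend instruction exists before SSE4.1. A minimal sketch of the idiom (names are ours):

    #include <emmintrin.h>

    /* result = (mask & a) | (~mask & b), with mask bytes 0x00 or 0xff:
       b survives where the mask is zero, a where it is all ones */
    __m128i select_bytes(__m128i mask, __m128i a, __m128i b) {
      return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
    }

Each andnot/and/or triple above, such as the one updating *q5p5 from flat2_pq[5], is exactly this select with a = the smoothed pixels and b = the narrow-filter output.
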
void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
@@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
__m128i limit = _mm_load_si128((const __m128i *)_limit);
__m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+ q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
_mm_cvtsi32_si128(*(int *)(s + 4 * p)));
- q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+ q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
_mm_cvtsi32_si128(*(int *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+ q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
_mm_cvtsi32_si128(*(int *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+ q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
_mm_cvtsi32_si128(*(int *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+ q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
_mm_cvtsi32_si128(*(int *)(s - 0 * p)));
- q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+ q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
_mm_cvtsi32_si128(*(int *)(s + 5 * p)));
- q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+ q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
_mm_cvtsi32_si128(*(int *)(s + 6 * p)));
lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
@@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
store_buffer_horz_8(q5p5, p, 5, s);
}
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
__m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
__m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
__m128i *thresh) {
@@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
const __m128i one = _mm_set1_epi8(1);
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+
{
// filter_mask and hev_mask
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
// flat_mask
flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
@@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
}
// 5 tap filter
- {
+ // needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
const __m128i four = _mm_set1_epi16(4);
-
__m128i workp_a, workp_b, workp_shft0, workp_shft1;
p2_16 = _mm_unpacklo_epi8(*p2, zero);
p1_16 = _mm_unpacklo_epi8(*p1, zero);
@@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2(
3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
}
+}
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ // Since SSE has no unsigned byte comparison, the idea is to detect at
+ // least one case where X > limit, which means the corresponding mask
+ // bit is set.
+ // To achieve that we take the global maximum of all the abs(x - y)
+ // inputs and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if it is > limit
+ // the mask is set, otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- *q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- *p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_srli_si128(pq0_16, 8);
+ q2_16 = _mm_srli_si128(pq2_16, 8);
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_b = _mm_srli_epi16(workp_b, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_a = _mm_srli_epi16(workp_a, 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
}
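
The sum bookkeeping in the 5-tap branch reduces to these scalar formulas, matching the running-sum comments above (a sketch assuming the "+ 4, >> 3" rounding convention those comments show; names are ours):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* Scalar form of the 5-tap ("filter6") smoothing computed above */
    void flat6_scalar(uint8_t p2, uint8_t p1, uint8_t p0,
                      uint8_t q0, uint8_t q1, uint8_t q2,
                      uint8_t *op1, uint8_t *op0,
                      uint8_t *oq0, uint8_t *oq1) {
      *op1 = (uint8_t)ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
      *op0 = (uint8_t)ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
      *oq0 = (uint8_t)ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
      *oq1 = (uint8_t)ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
    }

The SSE2 body evaluates the p-side and q-side sums in the two 64-bit halves of one register, which is why the op and oq results can be paired with a single _mm_unpacklo_epi64 before the shift.
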
void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
@@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
&limit, &thresh);
xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
}
void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
@@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
+ lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
@@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
static AOM_FORCE_INLINE void lpf_internal_8_sse2(
__m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
__m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
const __m128i zero = _mm_setzero_si128();
__m128i mask, hev, flat;
__m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
flat_p1p0, flat_q0q1;
__m128i q2p2, q1p1, q0p0;
__m128i q1q0, p1p0, ps1ps0, qs1qs0;
- __m128i work_a, op2, oq2;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ // filter_mask and hev_mask
+
+ // Since SSE has no unsigned byte comparison, the idea is to detect at
+ // least one case where X > limit, which means the corresponding mask
+ // bit is set.
+ // To achieve that we take the global maximum of all the abs(x - y)
+ // inputs and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if it is > limit
+ // the mask is set, otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+
+ opq2 = _mm_packus_epi16(workp_c, workp_c);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
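
As with the 6-tap case, the workp_a/workp_b running sums above implement the 7-tap ("filter8") taps in scalar terms (a sketch under the same rounding convention; names are ours):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* Scalar form of the 7-tap smoothing computed above */
    void flat8_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                      uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                      uint8_t *op2, uint8_t *op1, uint8_t *op0,
                      uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
      *op2 = (uint8_t)ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
      *op1 = (uint8_t)ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
      *op0 = (uint8_t)ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
      *oq0 = (uint8_t)ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
      *oq1 = (uint8_t)ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
      *oq2 = (uint8_t)ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
    }
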
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
q3p3 = _mm_unpacklo_epi64(*p3, *q3);
q2p2 = _mm_unpacklo_epi64(*p2, *q2);
@@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, *limit);
mask = _mm_cmpeq_epi8(mask, zero);
- // replicate for the further "merged variables" usage
- mask = _mm_unpacklo_epi64(mask, mask);
- // flat_mask4
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+ // flat_mask4
flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
flat = _mm_max_epu8(abs_p1p0, flat);
@@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
flat = _mm_unpacklo_epi64(flat, flat);
}
- // filter8
- {
+ // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
const __m128i four = _mm_set1_epi16(4);
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
p2_16 = _mm_unpacklo_epi8(*p2, zero);
p1_16 = _mm_unpacklo_epi8(*p1, zero);
p0_16 = _mm_unpacklo_epi8(*p0, zero);
@@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
_mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
- workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
// op1
workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
@@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2(
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
- }
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
- qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8);
- ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
- work_a = _mm_andnot_si128(flat, *q2);
- q2_16 = _mm_and_si128(flat, oq2);
- *q2_out = _mm_or_si128(work_a, q2_16);
-
- work_a = _mm_andnot_si128(flat, *p2);
- p2_16 = _mm_and_si128(flat, op2);
- *p2_out = _mm_or_si128(work_a, p2_16);
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
}
void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
@@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
const unsigned char *_limit,
const unsigned char *_thresh) {
__m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0, p2_out, q2_out;
+ __m128i q1q0, p1p0;
__m128i blimit = _mm_load_si128((const __m128i *)_blimit);
__m128i limit = _mm_load_si128((const __m128i *)_limit);
__m128i thresh = _mm_load_si128((const __m128i *)_thresh);
@@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &p2_out, &q2_out, &blimit, &limit, &thresh);
+ &blimit, &limit, &thresh);
xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
- xx_storel_32(s - 3 * p, p2_out);
- xx_storel_32(s + 2 * p, q2_out);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+ xx_storel_32(s - 3 * p, p2);
+ xx_storel_32(s + 2 * p, q2);
}
void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
@@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
_mm_loadl_epi64((__m128i *)(s + 6 * p)));
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
_mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
@@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
_mm_load_si128((__m128i *)_thresh1));
__m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0, p2_out, q2_out;
+ __m128i q1q0, p1p0;
p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
@@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &p2_out, &q2_out, &blimit, &limit, &thresh);
+ lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
_mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
_mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
_mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
}
void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
__m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
_mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
_mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
@@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
&q1);
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
p1 = _mm_srli_si128(ps1ps0, 8);
q1 = _mm_srli_si128(qs1qs0, 8);
@@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
&limit, &thresh);
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
@@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
d5 = _mm_srli_si128(d4d5, 8);
d7 = _mm_srli_si128(d6d7, 8);
- lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
+ lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
p0 = _mm_srli_si128(p1p0, 8);
q0 = _mm_srli_si128(q1q0, 8);
@@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
const unsigned char *_thresh) {
__m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p2, p0, q0, q2;
+ __m128i p0, q0;
__m128i x2, x1, x0, x3;
__m128i q1q0, p1p0;
__m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
&d7);
// Loop filtering
- lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
- &q2, &blimit, &limit, &thresh);
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
- transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+ transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
&d2, &d3);
_mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
@@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i d1, d3, d5, d7;
__m128i q1q0, p1p0;
- __m128i p2, p1, q1, q2;
+ __m128i p1, q1;
__m128i d0d1, d2d3, d4d5, d6d7;
x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
@@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
d5 = _mm_srli_si128(d4d5, 8);
d7 = _mm_srli_si128(d6d7, 8);
- lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
- &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+ lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+ &q1q0, &p1p0, &blimit, &limit, &thresh);
p1 = _mm_srli_si128(p1p0, 8);
q1 = _mm_srli_si128(q1q0, 8);
- transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
- &d4d5, &d6d7);
+ transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+ &d2d3, &d4d5, &d6d7);
_mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
_mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
@@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i x6, x5, x4, x3, x2, x1, x0;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
- __m128i q0, q1, q2, q3, q4, q5, q6, q7;
- __m128i p0_out, p1_out, p2_out, p3_out;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x6, x5, x4, x3;
+ __m128i pq0, pq1, pq2, pq3;
__m128i blimit = _mm_load_si128((__m128i *)_blimit);
__m128i limit = _mm_load_si128((__m128i *)_limit);
__m128i thresh = _mm_load_si128((__m128i *)_thresh);
- x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
- x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
- x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
-
- transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
- &p7);
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
- x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
- x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
- transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
- &q7);
-
- q6p6 = _mm_unpacklo_epi64(p1, q6);
- q5p5 = _mm_unpacklo_epi64(p2, q5);
- q4p4 = _mm_unpacklo_epi64(p3, q4);
- q3p3 = _mm_unpacklo_epi64(p4, q3);
- q2p2 = _mm_unpacklo_epi64(p5, q2);
- q1p1 = _mm_unpacklo_epi64(p6, q1);
- q0p0 = _mm_unpacklo_epi64(p7, q0);
+ transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+ &q5p5, &q6p6, &q7p7);
lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
&limit, &thresh);
- transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
- &p0_out, &p1_out, &p2_out, &p3_out);
-
- x0 = _mm_srli_si128(q0p0, 8);
- x1 = _mm_srli_si128(q1p1, 8);
- x2 = _mm_srli_si128(q2p2, 8);
- x3 = _mm_srli_si128(q3p3, 8);
- x4 = _mm_srli_si128(q4p4, 8);
- x5 = _mm_srli_si128(q5p5, 8);
- x6 = _mm_srli_si128(q6p6, 8);
-
- transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
- &q3);
-
- _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
- _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
-
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
- _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+ transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &pq0, &pq1, &pq2, &pq3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
}
void aom_lpf_vertical_14_dual_sse2(
@@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2(
q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
q7 = _mm_srli_si128(d14d15, 8);
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
x0 = _mm_srli_si128(q0p0, 8);
x1 = _mm_srli_si128(q1p1, 8);
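
A note on the rewritten vertical-14 path above: one unaligned 16-byte load per row now covers all 16 pixels straddling the filtered edge, p7..p0 followed by q0..q7, where the old code issued two 8-byte loads and paid for two 4x8 transposes. A minimal sketch of the new per-row load (illustrative helper, not part of the patch):

#include <emmintrin.h>
#include <stdint.h>

/* Sketch: s points at the first pixel right of the vertical edge, so
 * s - 8 is p7 and a single loadu picks up [p7..p0 | q0..q7] for the row. */
static __m128i load_edge_row_14(const uint8_t *s) {
  return _mm_loadu_si128((const __m128i *)(s - 8));
}
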
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index c6b6469b4..8970fe7dd 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H
-#define _AOM_DSP_X86_LPF_COMMON_X86_H
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2(
d4 + 1, d5 + 1, d6 + 1, d7 + 1);
}
-#endif // _AOM_DSP_X86_LPF_COMMON_X86_H
+#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
index 6538e4d5e..584b5e7e3 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -9,7 +9,6 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <stdio.h>
#include <tmmintrin.h>
#include "config/aom_config.h"
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
index 19b429d91..cffbd9672 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
-#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
const uint8_t *a_ptr, int a_stride,
@@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
const uint8_t *m_ptr, int m_stride,
int height);
-#endif
+#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
index dc41a8342..4faa098ac 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
-#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
#include <stdlib.h>
#include <string.h>
@@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
} while (i < height);
}
-#endif
+#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
index 8b69606dd..6c821673e 100644
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_DSP_X86_MEM_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
return dst;
}
-#endif // AOM_DSP_X86_MEM_SSE2_H_
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
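
The w4 loop above accumulates, for every pixel, the rounded residual between the weighted source and the masked prediction. A scalar sketch of that per-pixel step (hypothetical helper name; the 12-bit signed rounding matches xx_roundn_epi32, added in the next file):

#include <stdint.h>

/* Sketch of one pixel of the vector loop: diff = round(wsrc - mask * pre)
 * to 12 fractional bits, then accumulate diff and diff^2. */
static void obmc_accumulate_pixel(int32_t wsrc, int32_t mask, uint8_t pre,
                                  int *sum, unsigned int *sse) {
  const int32_t raw = wsrc - mask * (int32_t)pre; /* madd acts as mul here */
  const int32_t diff = (raw + ((1 << 12) >> 1) + (raw >> 31)) >> 12;
  *sum += diff;
  *sse += (unsigned int)(diff * diff);
}
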
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index a3535f985..48486c6c4 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
#include <immintrin.h>
@@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
}
-#endif // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
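
Per 32-bit lane, the new xx_roundn_epi32 adds half the divisor, subtracts one more for negative inputs via the sign mask, and arithmetic-shifts, which rounds halves away from zero as ROUND_POWER_OF_TWO_SIGNED does. One lane in scalar form (sketch):

#include <stdint.h>

static int32_t roundn_scalar(int32_t v, int bits) {
  const int32_t bias = (1 << bits) >> 1; /* 2048 when bits == 12 */
  const int32_t sign = v >> 31;          /* -1 if v < 0, else 0 */
  return (v + bias + sign) >> bits;
}

For bits == 12: v = 2048 gives (2048 + 2048) >> 12 = 1, and v = -2048 gives (-2048 + 2048 - 1) >> 12 = -1, i.e. both half-way cases round away from zero.
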
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 000000000..bfec0e8a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m128i v_d;
+ const uint8_t *pre_temp;
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+}
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm256_hadd_epi32(v_d, v_d);
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
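
Each OBMCVARWXH instantiation dispatches on width (4, 8, or >= 16) and finishes with the usual variance identity, sse - sum^2 / (W * H). Hand-expanded for one size, the macro yields roughly this (sketch):

unsigned int aom_obmc_variance8x8_avx2(const uint8_t *pre, int pre_stride,
                                       const int32_t *wsrc,
                                       const int32_t *mask,
                                       unsigned int *sse) {
  int sum;
  obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, 8, 8);
  /* variance = SSE - sum^2 / N with N = 8 * 8 */
  return *sse - (unsigned int)(((int64_t)sum * sum) / (8 * 8));
}
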
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 2e2f6e09f..72eda0e57 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -19,7 +19,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
#include "aom_dsp/x86/synonyms.h"
////////////////////////////////////////////////////////////////////////////////
@@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int h) {
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p_b = xx_loadl_32(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
-
- const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
- const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * h);
-
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
unsigned int *const sse, int *const sum,
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index e6b40262d..216a0bd8f 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -16,16 +16,12 @@
SECTION .text
%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
vzeroupper
- ; If we can skip this block, then just zero the output
- cmp skipmp, 0
- jne .blank
-
%ifnidn %1, b_32x32
; Special case for ncoeff == 16, as it is frequent and we can save on
@@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
.single_nonzero:
; Actual quantization of size 16 block - setup pointers, rounders, etc.
- movifnidn r4, roundmp
- movifnidn r5, quantmp
- mov r3, dequantmp
- mov r6, shiftmp
- mova m1, [r4] ; m1 = round
- mova m2, [r5] ; m2 = quant
- mova m3, [r3] ; m3 = dequant
- mova m4, [r6] ; m4 = shift
+ movifnidn r3, roundmp
+ movifnidn r4, quantmp
+ mov r6, dequantmp
+ mov r5, shiftmp
+ mova m1, [r3] ; m1 = round
+ mova m2, [r4] ; m2 = quant
+ mova m3, [r6] ; m3 = dequant
+ mova m4, [r5] ; m4 = shift
mov r3, iscanmp
@@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
%endif ; %ifnidn %1, b_32x32
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
qcoeff, dqcoeff, dequant, eob, scan, iscan
; Actual quantization loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
mova m0, [zbinq] ; m0 = zbin
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
- mova m3, [r2] ; m3 = dequant
+ mova m3, [dequantq] ; m3 = dequant
pcmpeqw m4, m4 ; All lanes -1
%ifidn %1, b_32x32
psubw m0, m4
@@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
mov r2, shiftmp
mov r3, qcoeffmp
- mova m4, [r2] ; m4 = shift
+ mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
@@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
lea coeffq, [ coeffq+ncoeffq*4]
@@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
mov [r2], ax
vzeroupper
RET
-
- ; Skip-block, i.e. just write all zeroes
-.blank:
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
- qcoeff, dqcoeff, dequant, eob, scan, iscan
-
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
-
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- neg ncoeffq
- pxor m7, m7
-
-.blank_loop:
- mova [dqcoeffq+ncoeffq*4+ 0], ymm7
- mova [dqcoeffq+ncoeffq*4+32], ymm7
- mova [qcoeffq+ncoeffq*4+ 0], ymm7
- mova [qcoeffq+ncoeffq*4+32], ymm7
- add ncoeffq, mmsize
- jl .blank_loop
-
- mov [eobq], word 0
-
- vzeroupper
- RET
%endmacro
INIT_XMM avx
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
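
With the skip_block argument and the .blank fallback gone, dequant is loaded through a named register (movifnidn dequantq, dequantmp), which is why the GPR count passed to QUANTIZE_FN rises from 7 to 9. The C prototype these kernels are now expected to match, inferred from the aom_quantize_b_sse2 signature later in this patch:

void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
                        const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan_ptr,
                        const int16_t *iscan_ptr);
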
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 46b9c7d29..d3de6e24d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -9,242 +9,139 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <assert.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
- if (sizeof(tran_low_t) == 4) {
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
- } else {
- return _mm_load_si128((const __m128i *)coeff_ptr);
- }
+ assert(sizeof(tran_low_t) == 4);
+
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
}
static INLINE void store_coefficients(__m128i coeff_vals,
tran_low_t *coeff_ptr) {
- if (sizeof(tran_low_t) == 4) {
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
- } else {
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
- }
+ assert(sizeof(tran_low_t) == 4);
+
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
}
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan_ptr,
const int16_t *iscan_ptr) {
- __m128i zero;
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
(void)scan_ptr;
- coeff_ptr += n_coeffs;
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
- if (!skip_block) {
- __m128i eob;
- __m128i zbin;
- __m128i round, quant, dequant, shift;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- __m128i pw_1;
- zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- pw_1 = _mm_set1_epi16(1);
- zbin = _mm_sub_epi16(zbin, pw_1);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
- // Do DC and first 15 AC
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- shift = _mm_unpackhi_epi64(shift, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- __m128i cmp_mask0, cmp_mask1;
-
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
- qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
- qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
- qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- store_coefficients(zero, dqcoeff_ptr + n_coeffs);
- store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
- store_coefficients(zero, qcoeff_ptr + n_coeffs);
- store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
}
+
+ *eob_ptr = accumulate_eob(eob);
}
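
store_coefficients widens eight int16 lanes to sign-extended int32 with a multiply-by-one trick: mulhi(x, 1) is exactly the sign-extension word of x, so interleaving the mullo/mulhi halves produces the 32-bit tran_low_t values. One lane in scalar form (sketch):

#include <stdint.h>

static int32_t widen_lane(int16_t x) {
  const uint16_t lo = (uint16_t)x;                        /* mullo(x, 1) */
  const uint16_t hi = (uint16_t)(((int32_t)x * 1) >> 16); /* mulhi(x, 1) */
  return (int32_t)(((uint32_t)hi << 16) | lo);            /* unpack lo/hi */
}
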
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index e2c1ebb71..39d4ca674 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -18,21 +18,18 @@ pw_1: times 8 dw 1
SECTION .text
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
- cmp dword skipm, 0
- jne .blank
; actual quantize loop - setup pointers, rounders, etc.
movifnidn coeffq, coeffmp
movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
mova m0, [zbinq] ; m0 = zbin
mova m1, [roundq] ; m1 = round
mova m2, [quantq] ; m2 = quant
@@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psrlw m0, 1 ; m0 = (m0 + 1) / 2
psrlw m1, 1 ; m1 = (m1 + 1) / 2
%endif
- mova m3, [r2q] ; m3 = dequant
- psubw m0, [GLOBAL(pw_1)]
+ mova m3, [dequantq] ; m3 = dequant
mov r2, shiftmp
- mov r3, qcoeffmp
+ psubw m0, [GLOBAL(pw_1)]
mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
mov r4, dqcoeffmp
mov r5, iscanmp
%ifidn %1, b_32x32
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
lea coeffq, [ coeffq+ncoeffq*4]
lea qcoeffq, [ qcoeffq+ncoeffq*4]
lea dqcoeffq, [dqcoeffq+ncoeffq*4]
@@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pextrw r6, m8, 0
mov [r2], r6
RET
-
- ; skip-block, i.e. just write all zeroes
-.blank:
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
- DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- neg ncoeffq
- pxor m7, m7
-.blank_loop:
- mova [dqcoeffq+ncoeffq*4+ 0], m7
- mova [dqcoeffq+ncoeffq*4+16], m7
- mova [dqcoeffq+ncoeffq*4+32], m7
- mova [dqcoeffq+ncoeffq*4+48], m7
- mova [qcoeffq+ncoeffq*4+ 0], m7
- mova [qcoeffq+ncoeffq*4+16], m7
- mova [qcoeffq+ncoeffq*4+32], m7
- mova [qcoeffq+ncoeffq*4+48], m7
- add ncoeffq, mmsize
- jl .blank_loop
- mov word [eobq], 0
- RET
%endmacro
INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 000000000..4eed7dd29
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With ssse3 and later abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
+// to zbin to add 1 to the index in 'scan'.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
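
calculate_qcoeff above is the AV1 quantizer step for a vector of absolute coefficients: tmp = ((abs + round) * quant) >> 16, then qcoeff = ((tmp + abs + round) * shift) >> 16. One 16-bit lane in scalar form (sketch; note _mm_adds_epi16 saturates where the plain scalar add would wrap):

#include <stdint.h>

static int16_t calc_qcoeff_lane(int16_t abs_coeff, int16_t round,
                                int16_t quant, int16_t shift) {
  const int16_t tmp0 = (int16_t)(abs_coeff + round);             /* adds */
  const int16_t tmp1 = (int16_t)(((int32_t)tmp0 * quant) >> 16); /* mulhi */
  return (int16_t)(((int32_t)(tmp0 + tmp1) * shift) >> 16);      /* mulhi */
}
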
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..305dde5c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
+ const __m256i v_a01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
+ const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
+ const __m256i v_b01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(
+ _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(
+ _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m256i v_a_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;
+ }
+
+ return sse;
+}
+
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;
+ }
+ return sse;
+}
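
Every width case in aom_sse_avx2 computes the same quantity and differs only in how many 8-bit lanes are widened and madd'ed per row; the result is the plain sum of squared differences. The contract in scalar form (sketch of the generic fallback behavior):

#include <stdint.h>

static int64_t sse_scalar(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int d = a[x] - b[x];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}
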
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 000000000..8b5af8469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+ sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+ sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+ sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default: break;
+ }
+
+ return sse;
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = xx_loadu_128(a);
+ const __m128i v_b_w = xx_loadu_128(b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default: break;
+ }
+ return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..0af44e3a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height) {
+ uint64_t result;
+ __m256i v_acc_q = _mm256_setzero_si256();
+ const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+ for (int col = 0; col < height; col += 4) {
+ __m256i v_acc_d = _mm256_setzero_si256();
+ for (int row = 0; row < width; row += 16) {
+ const int16_t *tempsrc = src + row;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+ }
+ v_acc_q =
+ _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+ src += 4 * stride;
+ }
+ __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+ __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+ __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+ result_64_2_int = _mm_add_epi64(
+ result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+ xx_storel_64(&result, result_64_2_int);
+
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+ int height) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
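
The nxn kernel above relies on _mm256_madd_epi16, which squares sixteen int16 samples and sums adjacent pairs into eight 32-bit lanes; those 32-bit partial sums are widened into 64-bit accumulators once per 4-row strip (low halves via the zero-extension mask, high halves via a 32-bit right shift) to keep the accumulators from overflowing across the full block height. A scalar sketch of the value being computed — a sketch, not the library's own aom_sum_squares_2d_i16_c:

    // Scalar reference for aom_sum_squares_2d_i16_*: the sum of v*v
    // over a width x height block of int16_t residuals.
    static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                           int width, int height) {
      uint64_t ss = 0;
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
          const int v = src[r * stride + c];
          ss += (uint64_t)(v * v);
        }
      }
      return ss;
    }

Note how the dispatcher keeps the existing SSE2 kernels for 4- and 8-wide blocks (the reason sum_squares_sse2.h is introduced below) and only takes the AVX2 path when the width is a multiple of 16, falling back to the C version for everything else.
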
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index a79f22d79..22d7739ec 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,6 +14,7 @@
#include <stdio.h>
#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
#include "config/aom_dsp_rtcd.h"
static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
@@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
}
-static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
- int stride) {
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
__m128i v_sum_d =
_mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
@@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
-static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
- int height) {
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height) {
int r = 0;
__m128i v_acc_q = _mm_setzero_si128();
do {
@@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
// maintenance instructions in the common case of 4x4.
__attribute__((noinline))
#endif
-static uint64_t
+uint64_t
aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
int height) {
int r = 0;
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 000000000..491e31cc5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+ int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index d9a53fcc5..1e9f1e27b 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_SYNONYMS_H_
-#define AOM_DSP_X86_SYNONYMS_H_
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
@@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
return _mm_srai_epi32(v_tmp_d, bits);
}
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
- const __m128i v_tmp_d =
- _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
- return _mm_srai_epi32(v_tmp_d, bits);
-}
-
static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
@@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
return _mm_srai_epi16(v_tmp_d, bits);
}
-#endif // AOM_DSP_X86_SYNONYMS_H_
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
index 39f371fc9..3f69b120e 100644
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
-#define AOM_DSP_X86_SYNONYMS_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
#include <immintrin.h>
@@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
-#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+ return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+ const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+ return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
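
yy_roundn_epu16() above leans on the identity that _mm256_avg_epu16(x, zero) computes (x + 1) >> 1 per lane, so shifting right by bits - 1 and then averaging with zero yields the rounded shift (v + (1 << (bits - 1))) >> bits for unsigned values. A scalar sketch of that identity (helper names hypothetical):

    #include <assert.h>
    #include <stdint.h>

    // Rounded right shift: the value yy_roundn_epu16() produces per lane.
    static uint16_t roundn_epu16_ref(uint16_t v, int bits) {
      return (uint16_t)((v + (1u << (bits - 1))) >> bits);
    }

    // Exhaustive check that the shift-then-average form agrees.
    static void check_roundn_identity(void) {
      for (int bits = 1; bits <= 8; ++bits) {
        for (uint32_t v = 0; v <= 0xffff; ++v) {
          const uint16_t avg_form = (uint16_t)(((v >> (bits - 1)) + 1) >> 1);
          assert(avg_form == roundn_epu16_ref((uint16_t)v, bits));
        }
      }
    }
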
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
index f88a1527d..d0d1ee684 100644
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
-#define AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in,
out[7] = _mm_unpackhi_epi64(a6, a7);
}
-#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
index bdff64b8f..b1611ba87 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
#include <emmintrin.h>
#include "aom/aom_integer.h"
@@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
}
#endif
-#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 58a792424..ed82eee96 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
#include <emmintrin.h>
#include "aom/aom_integer.h"
@@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
return _mm_shuffle_epi32(b, 0x4e);
}
-#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index a7ac2c93d..800aef126 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
return comp;
}
-void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8,
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride, const uint8_t *mask,
int mask_stride, int invert_mask) {
int i = 0;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
const uint16_t *src0 = invert_mask ? pred : ref;
const uint16_t *src1 = invert_mask ? ref : pred;
const int stride0 = invert_mask ? width : ref_stride;
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 7e3c5d5db..3c37e77c0 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -16,6 +16,7 @@
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
+#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
@@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
int mi_row, int mi_col, const MV *const mv,
uint8_t *comp_pred, int width, int height,
int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref, int ref_stride) {
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
// expect xd == NULL only in tests
if (xd != NULL) {
const MB_MODE_INFO *mi = xd->mi[0];
@@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
// Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
const InterpFilters filters =
av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
@@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
}
const InterpFilterParams *filter =
- av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
if (!subpel_x_q3 && !subpel_y_q3) {
if (width >= 16) {
@@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz =
+ (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
+  // TODO(Deepa): Remove the memset below once 4-tap SIMD
+  // implementations exist for SSE2 and SSSE3.
+ if (subpel_search == 1) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+ }
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
}
}
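
The intermediate_height computation above sizes the scratch buffer between the two convolve passes: the horizontal pass must produce filter_taps extra rows of context so the vertical filter can run over the full output height. A worked instance under assumed inputs:

    #include <assert.h>

    // Worked example of the intermediate_height formula above for an
    // assumed 16-row block with subpel_y_q3 == 4 and 8-tap filtering.
    static void intermediate_height_example(void) {
      const int height = 16, subpel_y_q3 = 4, filter_taps = 8;
      const int intermediate_height =
          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
      assert(intermediate_height == 23);  // 16 output rows + 7 rows of taps
    }
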
@@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride) {
+ int ref_stride, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride);
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
/*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
assert(!(width * height & 15));
n = width * height >> 4;
@@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2(
pred += 16;
}
}
+
+void aom_comp_mask_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+ const __m128i s1,
+ const __m128i a) {
+ const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+ const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+ const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+ const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+ const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+ const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ for (int j = 0; j < 2; j++) {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }
+}
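
Per pixel, highbd_comp_mask_pred_line_sse2() and the width-specialized loops above evaluate the AOM_BLEND_A64 combination: the mask weights one source, its complement against 64 weights the other, and the 32/6 round-and-shift pair comes from AOM_BLEND_A64_ROUND_BITS. A scalar sketch of the per-pixel math (the invert_mask swap is handled by the src0/src1 selection before the loops) — not the library's C fallback:

    // Scalar reference for one masked-blend output sample.
    static uint16_t blend_a64_ref(uint8_t m, uint16_t s0, uint16_t s1) {
      const int alpha_max = 1 << AOM_BLEND_A64_ROUND_BITS;     // 64
      const int round = (1 << AOM_BLEND_A64_ROUND_BITS) >> 1;  // 32
      return (uint16_t)((m * s0 + (alpha_max - m) * s1 + round) >>
                        AOM_BLEND_A64_ROUND_BITS);
    }
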
diff --git a/third_party/aom/aom_mem/aom_mem.h b/third_party/aom/aom_mem/aom_mem.h
index a36ee3e03..4b1fa45f1 100644
--- a/third_party/aom/aom_mem/aom_mem.h
+++ b/third_party/aom/aom_mem/aom_mem.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_MEM_AOM_MEM_H_
-#define AOM_MEM_AOM_MEM_H_
+#ifndef AOM_AOM_MEM_AOM_MEM_H_
+#define AOM_AOM_MEM_AOM_MEM_H_
#include "aom/aom_integer.h"
#include "config/aom_config.h"
@@ -67,4 +67,4 @@ void *aom_memset16(void *dest, int val, size_t length);
}
#endif
-#endif // AOM_MEM_AOM_MEM_H_
+#endif // AOM_AOM_MEM_AOM_MEM_H_
diff --git a/third_party/aom/aom_mem/include/aom_mem_intrnl.h b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
index 977ebadcd..cbc30a9bb 100644
--- a/third_party/aom/aom_mem/include/aom_mem_intrnl.h
+++ b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
-#define AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
#include "config/aom_config.h"
@@ -30,4 +30,4 @@
#define align_addr(addr, align) \
(void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))
-#endif // AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#endif // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
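
The align_addr() macro above is the standard round-up-to-a-power-of-two idiom: adding align - 1 and then masking off the low bits yields the next multiple of align (align must be a power of two). Two worked values, assuming align == 32:

    #include <assert.h>
    #include <stddef.h>

    static void align_addr_example(void) {
      // 0x1001 rounds up to the next 32-byte boundary ...
      assert(((0x1001 + 31) & ~(size_t)31) == 0x1020);
      // ... while an already-aligned address is left unchanged.
      assert(((0x1020 + 31) & ~(size_t)31) == 0x1020);
    }
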
diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h
index 8e04f8583..4d77aac5a 100644
--- a/third_party/aom/aom_ports/aom_once.h
+++ b/third_party/aom/aom_ports/aom_once.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_AOM_ONCE_H_
-#define AOM_PORTS_AOM_ONCE_H_
+#ifndef AOM_AOM_PORTS_AOM_ONCE_H_
+#define AOM_AOM_PORTS_AOM_ONCE_H_
#include "config/aom_config.h"
@@ -135,4 +135,4 @@ static void aom_once(void (*func)(void)) {
}
#endif
-#endif // AOM_PORTS_AOM_ONCE_H_
+#endif // AOM_AOM_PORTS_AOM_ONCE_H_
diff --git a/third_party/aom/aom_ports/aom_timer.h b/third_party/aom/aom_ports/aom_timer.h
index c719ec677..9b17b8983 100644
--- a/third_party/aom/aom_ports/aom_timer.h
+++ b/third_party/aom/aom_ports/aom_timer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_AOM_TIMER_H_
-#define AOM_PORTS_AOM_TIMER_H_
+#ifndef AOM_AOM_PORTS_AOM_TIMER_H_
+#define AOM_AOM_PORTS_AOM_TIMER_H_
#include "config/aom_config.h"
@@ -108,4 +108,4 @@ static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) {
#endif /* CONFIG_OS_SUPPORT */
-#endif // AOM_PORTS_AOM_TIMER_H_
+#endif // AOM_AOM_PORTS_AOM_TIMER_H_
diff --git a/third_party/aom/aom_ports/arm.h b/third_party/aom/aom_ports/arm.h
index a1a2ab765..cb1fb9bec 100644
--- a/third_party/aom/aom_ports/arm.h
+++ b/third_party/aom/aom_ports/arm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_ARM_H_
-#define AOM_PORTS_ARM_H_
+#ifndef AOM_AOM_PORTS_ARM_H_
+#define AOM_AOM_PORTS_ARM_H_
#include <stdlib.h>
#include "config/aom_config.h"
@@ -26,7 +26,7 @@ extern "C" {
/*ARMv7 optional NEON instructions.*/
#define HAS_NEON 0x04
-int arm_cpu_caps(void);
+int aom_arm_cpu_caps(void);
// Earlier gcc compilers have issues with some neon intrinsics
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
@@ -38,4 +38,4 @@ int arm_cpu_caps(void);
} // extern "C"
#endif
-#endif // AOM_PORTS_ARM_H_
+#endif // AOM_AOM_PORTS_ARM_H_
diff --git a/third_party/aom/aom_ports/arm_cpudetect.c b/third_party/aom/aom_ports/arm_cpudetect.c
index 70efee996..5a75bb348 100644
--- a/third_party/aom/aom_ports/arm_cpudetect.c
+++ b/third_party/aom/aom_ports/arm_cpudetect.c
@@ -40,7 +40,7 @@ static int arm_cpu_env_mask(void) {
#if !CONFIG_RUNTIME_CPU_DETECT
-int arm_cpu_caps(void) {
+int aom_arm_cpu_caps(void) {
/* This function should actually be a no-op. There is no way to adjust any of
* these because the RTCD tables do not exist: the functions are called
* statically */
@@ -62,7 +62,7 @@ int arm_cpu_caps(void) {
#define WIN32_EXTRA_LEAN
#include <windows.h>
-int arm_cpu_caps(void) {
+int aom_arm_cpu_caps(void) {
int flags;
int mask;
if (!arm_cpu_env_flags(&flags)) {
@@ -90,7 +90,7 @@ int arm_cpu_caps(void) {
#elif defined(__ANDROID__) /* end _MSC_VER */
#include <cpu-features.h>
-int arm_cpu_caps(void) {
+int aom_arm_cpu_caps(void) {
int flags;
int mask;
uint64_t features;
@@ -110,7 +110,7 @@ int arm_cpu_caps(void) {
#include <stdio.h>
-int arm_cpu_caps(void) {
+int aom_arm_cpu_caps(void) {
FILE *fin;
int flags;
int mask;
diff --git a/third_party/aom/aom_ports/bitops.h b/third_party/aom/aom_ports/bitops.h
index 36f5bd487..44df17307 100644
--- a/third_party/aom/aom_ports/bitops.h
+++ b/third_party/aom/aom_ports/bitops.h
@@ -9,12 +9,13 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_BITOPS_H_
-#define AOM_PORTS_BITOPS_H_
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
#include <assert.h>
#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
#ifdef _MSC_VER
#if defined(_M_X64) || defined(_M_IX86)
@@ -27,6 +28,8 @@
extern "C" {
#endif
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
// These versions of get_msb() are only valid when n != 0 because all
// of the optimized versions are undefined when n == 0:
// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
@@ -49,7 +52,6 @@ static INLINE int get_msb(unsigned int n) {
}
#undef USE_MSC_INTRINSICS
#else
-// Returns (int)floor(log2(n)). n must be > 0.
static INLINE int get_msb(unsigned int n) {
int log = 0;
unsigned int value = n;
@@ -73,4 +75,4 @@ static INLINE int get_msb(unsigned int n) {
} // extern "C"
#endif
-#endif // AOM_PORTS_BITOPS_H_
+#endif // AOM_AOM_PORTS_BITOPS_H_
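
A few worked values for the get_msb() contract now documented above; the n > 0 requirement exists because the builtins the fast paths map to (__builtin_clz, _BitScanReverse) are undefined at zero:

    #include <assert.h>

    // get_msb(n) == (int)floor(log2(n)) for n > 0.
    static void get_msb_examples(void) {
      assert(get_msb(1) == 0);    // 0b1
      assert(get_msb(2) == 1);    // 0b10
      assert(get_msb(255) == 7);  // 0b11111111
      assert(get_msb(256) == 8);  // 0b100000000
    }
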
diff --git a/third_party/aom/aom_ports/emmintrin_compat.h b/third_party/aom/aom_ports/emmintrin_compat.h
index f9d44c647..85d218a3d 100644
--- a/third_party/aom/aom_ports/emmintrin_compat.h
+++ b/third_party/aom/aom_ports/emmintrin_compat.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_EMMINTRIN_COMPAT_H_
-#define AOM_PORTS_EMMINTRIN_COMPAT_H_
+#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
+#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
#if defined(__GNUC__) && __GNUC__ < 4
/* From emmintrin.h (gcc 4.5.3) */
@@ -53,4 +53,4 @@ extern __inline __m128d
}
#endif
-#endif // AOM_PORTS_EMMINTRIN_COMPAT_H_
+#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
diff --git a/third_party/aom/aom_ports/mem.h b/third_party/aom/aom_ports/mem.h
index 0793d82e4..3ffea3cd6 100644
--- a/third_party/aom/aom_ports/mem.h
+++ b/third_party/aom/aom_ports/mem.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_MEM_H_
-#define AOM_PORTS_MEM_H_
+#ifndef AOM_AOM_PORTS_MEM_H_
+#define AOM_AOM_PORTS_MEM_H_
#include "aom/aom_integer.h"
#include "config/aom_config.h"
@@ -66,4 +66,4 @@
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
-#endif // AOM_PORTS_MEM_H_
+#endif // AOM_AOM_PORTS_MEM_H_
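
The CONVERT_TO_* shift macros above are what the high-bitdepth kernels in this patch depend on (e.g. the new comp_pred8 parameters in variance_avx2.c and variance_sse2.c): a real uint16_t buffer is passed through uint8_t * interfaces with its address halved, then recovered by doubling. Since uint16_t storage is at least 2-byte aligned, no address bits are lost. A sketch with hypothetical helper names:

    static void consume_highbd(const uint8_t *pred8) {
      // Recover the real 16-bit pointer: (uintptr_t)pred8 << 1.
      const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
      (void)pred;  // ... operate on 16-bit samples ...
    }

    static void produce_highbd(void) {
      static uint16_t buf[64];
      // Halve the address so it survives the uint8_t * API boundary.
      consume_highbd(CONVERT_TO_BYTEPTR(buf));
    }
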
diff --git a/third_party/aom/aom_ports/mem_ops.h b/third_party/aom/aom_ports/mem_ops.h
index ef0ee17ee..2b5bc0f0f 100644
--- a/third_party/aom/aom_ports/mem_ops.h
+++ b/third_party/aom/aom_ports/mem_ops.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_MEM_OPS_H_
-#define AOM_PORTS_MEM_OPS_H_
+#ifndef AOM_AOM_PORTS_MEM_OPS_H_
+#define AOM_AOM_PORTS_MEM_OPS_H_
/* \file
* \brief Provides portable memory access primitives
@@ -225,4 +225,4 @@ static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
mem[3] = (MAU_T)((val >> 24) & 0xff);
}
/* clang-format on */
-#endif // AOM_PORTS_MEM_OPS_H_
+#endif // AOM_AOM_PORTS_MEM_OPS_H_
diff --git a/third_party/aom/aom_ports/mem_ops_aligned.h b/third_party/aom/aom_ports/mem_ops_aligned.h
index 81fe41a63..37c367531 100644
--- a/third_party/aom/aom_ports/mem_ops_aligned.h
+++ b/third_party/aom/aom_ports/mem_ops_aligned.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_MEM_OPS_ALIGNED_H_
-#define AOM_PORTS_MEM_OPS_ALIGNED_H_
+#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
+#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
#include "aom/aom_integer.h"
@@ -170,4 +170,4 @@ mem_put_le_aligned_generic(32)
#undef swap_endian_32_se
/* clang-format on */
-#endif // AOM_PORTS_MEM_OPS_ALIGNED_H_
+#endif // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h
index 7d2b54028..e78e605f2 100644
--- a/third_party/aom/aom_ports/msvc.h
+++ b/third_party/aom/aom_ports/msvc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_MSVC_H_
-#define AOM_PORTS_MSVC_H_
+#ifndef AOM_AOM_PORTS_MSVC_H_
+#define AOM_AOM_PORTS_MSVC_H_
#ifdef _MSC_VER
#include "config/aom_config.h"
@@ -72,4 +72,4 @@ static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
#endif // _MSC_VER <= 1900
#endif // HAVE_AVX
#endif // _MSC_VER
-#endif // AOM_PORTS_MSVC_H_
+#endif // AOM_AOM_PORTS_MSVC_H_
diff --git a/third_party/aom/aom_ports/ppc.h b/third_party/aom/aom_ports/ppc.h
index ec487c2bc..3159bda68 100644
--- a/third_party/aom/aom_ports/ppc.h
+++ b/third_party/aom/aom_ports/ppc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_PPC_H_
-#define AOM_PORTS_PPC_H_
+#ifndef AOM_AOM_PORTS_PPC_H_
+#define AOM_AOM_PORTS_PPC_H_
#include <stdlib.h>
#include "config/aom_config.h"
@@ -27,4 +27,4 @@ int ppc_simd_caps(void);
} // extern "C"
#endif
-#endif // AOM_PORTS_PPC_H_
+#endif // AOM_AOM_PORTS_PPC_H_
diff --git a/third_party/aom/aom_ports/sanitizer.h b/third_party/aom/aom_ports/sanitizer.h
index d4e197e2f..1dd8eb4cf 100644
--- a/third_party/aom/aom_ports/sanitizer.h
+++ b/third_party/aom/aom_ports/sanitizer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_SANITIZER_H_
-#define AOM_PORTS_SANITIZER_H_
+#ifndef AOM_AOM_PORTS_SANITIZER_H_
+#define AOM_AOM_PORTS_SANITIZER_H_
// AddressSanitizer support.
@@ -35,4 +35,4 @@
#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
#endif
-#endif // AOM_PORTS_SANITIZER_H_
+#endif // AOM_AOM_PORTS_SANITIZER_H_
diff --git a/third_party/aom/aom_ports/system_state.h b/third_party/aom/aom_ports/system_state.h
index 0f2c3d8b5..6640839d8 100644
--- a/third_party/aom/aom_ports/system_state.h
+++ b/third_party/aom/aom_ports/system_state.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_SYSTEM_STATE_H_
-#define AOM_PORTS_SYSTEM_STATE_H_
+#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
+#define AOM_AOM_PORTS_SYSTEM_STATE_H_
#include "config/aom_config.h"
@@ -20,4 +20,4 @@ void aom_reset_mmx_state(void);
#else
#define aom_clear_system_state()
#endif // ARCH_X86 || ARCH_X86_64
-#endif // AOM_PORTS_SYSTEM_STATE_H_
+#endif // AOM_AOM_PORTS_SYSTEM_STATE_H_
diff --git a/third_party/aom/aom_ports/x86.h b/third_party/aom/aom_ports/x86.h
index b642a57f7..52ee49cb3 100644
--- a/third_party/aom/aom_ports/x86.h
+++ b/third_party/aom/aom_ports/x86.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_PORTS_X86_H_
-#define AOM_PORTS_X86_H_
+#ifndef AOM_AOM_PORTS_X86_H_
+#define AOM_AOM_PORTS_X86_H_
#include <stdlib.h>
#if defined(_MSC_VER)
@@ -322,4 +322,4 @@ extern void aom_reset_mmx_state(void);
} // extern "C"
#endif
-#endif // AOM_PORTS_X86_H_
+#endif // AOM_AOM_PORTS_X86_H_
diff --git a/third_party/aom/aom_scale/aom_scale.h b/third_party/aom/aom_scale/aom_scale.h
index a4aef6c65..11812a145 100644
--- a/third_party/aom/aom_scale/aom_scale.h
+++ b/third_party/aom/aom_scale/aom_scale.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_SCALE_AOM_SCALE_H_
-#define AOM_SCALE_AOM_SCALE_H_
+#ifndef AOM_AOM_SCALE_AOM_SCALE_H_
+#define AOM_AOM_SCALE_AOM_SCALE_H_
#include "aom_scale/yv12config.h"
@@ -20,4 +20,4 @@ extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
unsigned int vscale, unsigned int vratio,
unsigned int interlaced, const int num_planes);
-#endif // AOM_SCALE_AOM_SCALE_H_
+#endif // AOM_AOM_SCALE_AOM_SCALE_H_
diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c
index ca5b69066..84705e2d8 100644
--- a/third_party/aom/aom_scale/generic/yv12config.c
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@@ -14,6 +14,7 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
/****************************************************************************
* Exports
@@ -50,11 +51,11 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
int border, int byte_alignment,
aom_codec_frame_buffer_t *fb,
aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
- if (ybf) {
#if CONFIG_SIZE_LIMIT
- if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
#endif
+ if (ybf) {
const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
const int aligned_width = (width + 7) & ~7;
const int aligned_height = (height + 7) & ~7;
@@ -74,6 +75,17 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
uint8_t *buf = NULL;
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+ // The size of ybf->buffer_alloc.
+ uint64_t alloc_size = frame_size;
+ // The size of ybf->y_buffer_8bit.
+ if (use_highbitdepth) alloc_size += yplane_size;
+ // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+ // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+ // frame buffers were allocated in a single allocation.
+ if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1;
+#endif
+
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -94,7 +106,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
// This memset is needed for fixing the issue of using uninitialized
// value in msan test. It will cause a perf loss, so only do this for
// msan test.
- memset(ybf->buffer_alloc, 0, (int)frame_size);
+ memset(ybf->buffer_alloc, 0, (size_t)frame_size);
#endif
#endif
} else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
@@ -165,7 +177,11 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
if (!ybf->y_buffer_8bit) return -1;
} else {
- assert(!ybf->y_buffer_8bit);
+ if (ybf->y_buffer_8bit) {
+ aom_free(ybf->y_buffer_8bit);
+ ybf->y_buffer_8bit = NULL;
+ ybf->buf_8bit_valid = 0;
+ }
}
ybf->corrupted = 0; /* assume not corrupted by errors */
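
The AOM_MAX_ALLOCABLE_MEMORY guard added above bounds each candidate buffer so that a full pool of reference frames cannot exceed the global cap; this is why av1/common/enums.h (which defines REF_FRAMES) is now included. Restated as a sketch, with the 64-bit defaults assumed (an 8 GiB cap and REF_FRAMES == 8, i.e. 1 GiB per buffer — verify the configured values in enums.h):

    // The buffer is allowed only if REF_FRAMES buffers of this size
    // would still fit under the global allocation cap.
    static int frame_alloc_within_budget(uint64_t alloc_size) {
      return alloc_size <= AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES;
    }
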
diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h
index 2b4f597b0..2fb81acd7 100644
--- a/third_party/aom/aom_scale/yv12config.h
+++ b/third_party/aom/aom_scale/yv12config.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_SCALE_YV12CONFIG_H_
-#define AOM_SCALE_YV12CONFIG_H_
+#ifndef AOM_AOM_SCALE_YV12CONFIG_H_
+#define AOM_AOM_SCALE_YV12CONFIG_H_
#ifdef __cplusplus
extern "C" {
@@ -28,7 +28,11 @@ extern "C" {
// TODO(jingning): Use unified inter predictor for encoder and
// decoder during the development process. Revisit the frame border
// to improve the decoder performance.
+#if CONFIG_REDUCED_ENCODER_BORDER
+#define AOM_BORDER_IN_PIXELS 160
+#else
#define AOM_BORDER_IN_PIXELS 288
+#endif // CONFIG_REDUCED_ENCODER_BORDER
typedef struct yv12_buffer_config {
union {
@@ -136,4 +140,4 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
}
#endif
-#endif // AOM_SCALE_YV12CONFIG_H_
+#endif // AOM_AOM_SCALE_YV12CONFIG_H_
diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h
index fdb724d0c..f14c1ac18 100644
--- a/third_party/aom/aom_util/aom_thread.h
+++ b/third_party/aom/aom_util/aom_thread.h
@@ -14,8 +14,8 @@
// Original source:
// https://chromium.googlesource.com/webm/libwebp
-#ifndef AOM_THREAD_H_
-#define AOM_THREAD_H_
+#ifndef AOM_AOM_UTIL_AOM_THREAD_H_
+#define AOM_AOM_UTIL_AOM_THREAD_H_
#include "config/aom_config.h"
@@ -427,4 +427,4 @@ const AVxWorkerInterface *aom_get_worker_interface(void);
} // extern "C"
#endif
-#endif // AOM_THREAD_H_
+#endif // AOM_AOM_UTIL_AOM_THREAD_H_
diff --git a/third_party/aom/aom_util/debug_util.h b/third_party/aom/aom_util/debug_util.h
index 4096801db..127a8b468 100644
--- a/third_party/aom/aom_util/debug_util.h
+++ b/third_party/aom/aom_util/debug_util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_UTIL_DEBUG_UTIL_H_
-#define AOM_UTIL_DEBUG_UTIL_H_
+#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_
+#define AOM_AOM_UTIL_DEBUG_UTIL_H_
#include "config/aom_config.h"
@@ -66,4 +66,4 @@ void mismatch_check_block_tx(const uint8_t *src, int src_stride,
} // extern "C"
#endif
-#endif // AOM_UTIL_DEBUG_UTIL_H_
+#endif // AOM_AOM_UTIL_DEBUG_UTIL_H_
diff --git a/third_party/aom/aom_util/endian_inl.h b/third_party/aom/aom_util/endian_inl.h
index 2d2822141..f536ec5b8 100644
--- a/third_party/aom/aom_util/endian_inl.h
+++ b/third_party/aom/aom_util/endian_inl.h
@@ -11,8 +11,8 @@
//
// Endian related functions.
-#ifndef AOM_UTIL_ENDIAN_INL_H_
-#define AOM_UTIL_ENDIAN_INL_H_
+#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_
+#define AOM_AOM_UTIL_ENDIAN_INL_H_
#include <stdlib.h>
@@ -119,4 +119,4 @@ static INLINE uint64_t BSwap64(uint64_t x) {
#endif // HAVE_BUILTIN_BSWAP64
}
-#endif // AOM_UTIL_ENDIAN_INL_H_
+#endif // AOM_AOM_UTIL_ENDIAN_INL_H_
diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c
index 48952586f..ff13b6f50 100644
--- a/third_party/aom/apps/aomdec.c
+++ b/third_party/aom/apps/aomdec.c
@@ -40,6 +40,7 @@
#include "common/webmdec.h"
#endif
+#include "common/rawenc.h"
#include "common/y4menc.h"
#if CONFIG_LIBYUV
@@ -83,8 +84,6 @@ static const arg_def_t outputfile =
ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
static const arg_def_t threadsarg =
ARG_DEF("t", "threads", 1, "Max threads to use");
-static const arg_def_t rowmtarg =
- ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading");
static const arg_def_t verbosearg =
ARG_DEF("v", "verbose", 0, "Show version string");
static const arg_def_t scalearg =
@@ -99,29 +98,23 @@ static const arg_def_t framestatsarg =
ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
static const arg_def_t outbitdeptharg =
ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
-static const arg_def_t tilem = ARG_DEF(NULL, "tile-mode", 1,
- "Tile coding mode "
- "(0 for normal tile coding mode)");
-static const arg_def_t tiler = ARG_DEF(NULL, "tile-row", 1,
- "Row index of tile to decode "
- "(-1 for all rows)");
-static const arg_def_t tilec = ARG_DEF(NULL, "tile-column", 1,
- "Column index of tile to decode "
- "(-1 for all columns)");
static const arg_def_t isannexb =
ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
static const arg_def_t oppointarg = ARG_DEF(
NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
static const arg_def_t outallarg = ARG_DEF(
NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+static const arg_def_t skipfilmgrain =
+ ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
static const arg_def_t *all_args[] = {
- &help, &codecarg, &use_yv12, &use_i420, &flipuvarg,
- &rawvideo, &noblitarg, &progressarg, &limitarg, &skiparg,
- &postprocarg, &summaryarg, &outputfile, &threadsarg, &rowmtarg,
- &verbosearg, &scalearg, &fb_arg, &md5arg, &framestatsarg,
- &continuearg, &outbitdeptharg, &tilem, &tiler, &tilec,
- &isannexb, &oppointarg, &outallarg, NULL
+ &help, &codecarg, &use_yv12, &use_i420,
+ &flipuvarg, &rawvideo, &noblitarg, &progressarg,
+ &limitarg, &skiparg, &postprocarg, &summaryarg,
+ &outputfile, &threadsarg, &verbosearg, &scalearg,
+ &fb_arg, &md5arg, &framestatsarg, &continuearg,
+ &outbitdeptharg, &isannexb, &oppointarg, &outallarg,
+ &skipfilmgrain, NULL
};
#if CONFIG_LIBYUV
@@ -254,44 +247,6 @@ static int read_frame(struct AvxDecInputContext *input, uint8_t **buf,
}
}
-static void update_image_md5(const aom_image_t *img, const int planes[3],
- MD5Context *md5) {
- int i, y;
-
- for (i = 0; i < 3; ++i) {
- const int plane = planes[i];
- const unsigned char *buf = img->planes[plane];
- const int stride = img->stride[plane];
- const int w = aom_img_plane_width(img, plane) *
- ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
- const int h = aom_img_plane_height(img, plane);
-
- for (y = 0; y < h; ++y) {
- MD5Update(md5, buf, w);
- buf += stride;
- }
- }
-}
-
-static void write_image_file(const aom_image_t *img, const int *planes,
- const int num_planes, FILE *file) {
- int i, y;
- const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
-
- for (i = 0; i < num_planes; ++i) {
- const int plane = planes[i];
- const unsigned char *buf = img->planes[plane];
- const int stride = img->stride[plane];
- const int w = aom_img_plane_width(img, plane);
- const int h = aom_img_plane_height(img, plane);
-
- for (y = 0; y < h; ++y) {
- fwrite(buf, bytes_per_sample, w, file);
- buf += stride;
- }
- }
-}
-
static int file_is_raw(struct AvxInputContext *input) {
uint8_t buf[32];
int is_raw = 0;
@@ -505,16 +460,13 @@ static int main_loop(int argc, const char **argv_) {
int opt_raw = 0;
aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
unsigned int output_bit_depth = 0;
- unsigned int tile_mode = 0;
unsigned int is_annexb = 0;
- int tile_row = -1;
- int tile_col = -1;
int frames_corrupted = 0;
int dec_flags = 0;
int do_scale = 0;
int operating_point = 0;
int output_all_layers = 0;
- unsigned int row_mt = 0;
+ int skip_film_grain = 0;
aom_image_t *scaled_img = NULL;
aom_image_t *img_shifted = NULL;
int frame_avail, got_data, flush_decoder = 0;
@@ -611,8 +563,6 @@ static int main_loop(int argc, const char **argv_) {
cfg.threads);
}
#endif
- } else if (arg_match(&arg, &rowmtarg, argi)) {
- row_mt = arg_parse_uint(&arg);
} else if (arg_match(&arg, &verbosearg, argi)) {
quiet = 0;
} else if (arg_match(&arg, &scalearg, argi)) {
@@ -623,19 +573,15 @@ static int main_loop(int argc, const char **argv_) {
keep_going = 1;
} else if (arg_match(&arg, &outbitdeptharg, argi)) {
output_bit_depth = arg_parse_uint(&arg);
- } else if (arg_match(&arg, &tilem, argi)) {
- tile_mode = arg_parse_int(&arg);
} else if (arg_match(&arg, &isannexb, argi)) {
is_annexb = 1;
input.obu_ctx->is_annexb = 1;
- } else if (arg_match(&arg, &tiler, argi)) {
- tile_row = arg_parse_int(&arg);
- } else if (arg_match(&arg, &tilec, argi)) {
- tile_col = arg_parse_int(&arg);
} else if (arg_match(&arg, &oppointarg, argi)) {
operating_point = arg_parse_int(&arg);
} else if (arg_match(&arg, &outallarg, argi)) {
output_all_layers = 1;
+ } else if (arg_match(&arg, &skipfilmgrain, argi)) {
+ skip_film_grain = 1;
} else {
argj++;
}
@@ -739,30 +685,11 @@ static int main_loop(int argc, const char **argv_) {
if (!quiet) fprintf(stderr, "%s\n", decoder.name);
-#if CONFIG_AV1_DECODER
- if (aom_codec_control(&decoder, AV1_SET_TILE_MODE, tile_mode)) {
- fprintf(stderr, "Failed to set decode_tile_mode: %s\n",
- aom_codec_error(&decoder));
- goto fail;
- }
-
if (aom_codec_control(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder));
goto fail;
}
- if (aom_codec_control(&decoder, AV1_SET_DECODE_TILE_ROW, tile_row)) {
- fprintf(stderr, "Failed to set decode_tile_row: %s\n",
- aom_codec_error(&decoder));
- goto fail;
- }
-
- if (aom_codec_control(&decoder, AV1_SET_DECODE_TILE_COL, tile_col)) {
- fprintf(stderr, "Failed to set decode_tile_col: %s\n",
- aom_codec_error(&decoder));
- goto fail;
- }
-
if (aom_codec_control(&decoder, AV1D_SET_OPERATING_POINT, operating_point)) {
fprintf(stderr, "Failed to set operating_point: %s\n",
aom_codec_error(&decoder));
@@ -776,11 +703,11 @@ static int main_loop(int argc, const char **argv_) {
goto fail;
}
- if (aom_codec_control(&decoder, AV1D_SET_ROW_MT, row_mt)) {
- fprintf(stderr, "Failed to set row_mt: %s\n", aom_codec_error(&decoder));
+ if (aom_codec_control(&decoder, AV1D_SET_SKIP_FILM_GRAIN, skip_film_grain)) {
+ fprintf(stderr, "Failed to set skip_film_grain: %s\n",
+ aom_codec_error(&decoder));
goto fail;
}
-#endif
if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
while (arg_skip) {
@@ -903,6 +830,8 @@ static int main_loop(int argc, const char **argv_) {
scaled_img =
aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
scaled_img->bit_depth = img->bit_depth;
+ scaled_img->monochrome = img->monochrome;
+ scaled_img->csp = img->csp;
}
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
@@ -936,11 +865,15 @@ static int main_loop(int argc, const char **argv_) {
aom_img_free(img_shifted);
img_shifted = NULL;
}
+ if (img_shifted) {
+ img_shifted->monochrome = img->monochrome;
+ }
if (!img_shifted) {
img_shifted =
aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
img_shifted->bit_depth = output_bit_depth;
img_shifted->monochrome = img->monochrome;
+ img_shifted->csp = img->csp;
}
if (output_bit_depth > img->bit_depth) {
aom_img_upshift(img_shifted, img,
@@ -956,8 +889,7 @@ static int main_loop(int argc, const char **argv_) {
aom_input_ctx.width = img->d_w;
aom_input_ctx.height = img->d_h;
- int num_planes = (!use_y4m && opt_raw && img->monochrome) ? 1 : 3;
-
+ int num_planes = (opt_raw && img->monochrome) ? 1 : 3;
if (single_file) {
if (use_y4m) {
char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
@@ -966,8 +898,8 @@ static int main_loop(int argc, const char **argv_) {
// Y4M file header
len = y4m_write_file_header(
y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
- aom_input_ctx.height, &aom_input_ctx.framerate, img->fmt,
- img->bit_depth);
+ aom_input_ctx.height, &aom_input_ctx.framerate,
+ img->monochrome, img->csp, img->fmt, img->bit_depth);
if (do_md5) {
MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
} else {
@@ -979,8 +911,10 @@ static int main_loop(int argc, const char **argv_) {
len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
if (do_md5) {
MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+ y4m_update_image_md5(img, planes, &md5_ctx);
} else {
fputs(y4m_buf, outfile);
+ y4m_write_image_file(img, planes, outfile);
}
} else {
if (frame_out == 1) {
@@ -1004,24 +938,31 @@ static int main_loop(int argc, const char **argv_) {
}
}
}
- }
-
- if (do_md5) {
- update_image_md5(img, planes, &md5_ctx);
- } else {
- write_image_file(img, planes, num_planes, outfile);
+ if (do_md5) {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
}
} else {
generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
img->d_h, frame_in);
if (do_md5) {
MD5Init(&md5_ctx);
- update_image_md5(img, planes, &md5_ctx);
+ if (use_y4m) {
+ y4m_update_image_md5(img, planes, &md5_ctx);
+ } else {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ }
MD5Final(md5_digest, &md5_ctx);
print_md5(md5_digest, outfile_name);
} else {
outfile = open_outfile(outfile_name);
- write_image_file(img, planes, num_planes, outfile);
+ if (use_y4m) {
+ y4m_write_image_file(img, planes, outfile);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
fclose(outfile);
}
}
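
The deleted update_image_md5()/write_image_file() helpers above now live behind common/rawenc.h as raw_update_image_md5() and raw_write_image_file(), with y4m_* counterparts handling the Y4M container. Reconstructed from the removed code, the raw write path is a per-plane, per-row loop:

    // Per-plane raw write loop, as performed by the removed
    // write_image_file() above; a sketch of what the shared
    // raw_write_image_file() replacement does.
    static void write_planes_ref(const aom_image_t *img, const int *planes,
                                 int num_planes, FILE *file) {
      const int bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
      for (int i = 0; i < num_planes; ++i) {
        const int plane = planes[i];
        const unsigned char *buf = img->planes[plane];
        const int stride = img->stride[plane];
        const int w = aom_img_plane_width(img, plane);
        const int h = aom_img_plane_height(img, plane);
        for (int y = 0; y < h; ++y) {
          fwrite(buf, bps, w, file);  // w samples, bps bytes each
          buf += stride;
        }
      }
    }
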
diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c
index 31cb662e4..2e5d35cfe 100644
--- a/third_party/aom/apps/aomenc.c
+++ b/third_party/aom/apps/aomenc.c
@@ -180,9 +180,6 @@ static const arg_def_t use_webm =
ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF");
static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU");
-static const arg_def_t out_part =
- ARG_DEF("P", "output-partitions", 0,
- "Makes encoder output partitions. Requires IVF output!");
static const arg_def_t q_hist_n =
ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
static const arg_def_t rate_hist_n =
@@ -203,6 +200,12 @@ static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
bitdepth_enum);
static const arg_def_t inbitdeptharg =
ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input");
+
+static const arg_def_t input_chroma_subsampling_x = ARG_DEF(
+    NULL, "input-chroma-subsampling-x", 1, "Chroma subsampling x value");
+static const arg_def_t input_chroma_subsampling_y = ARG_DEF(
+    NULL, "input-chroma-subsampling-y", 1, "Chroma subsampling y value");
+
static const arg_def_t *main_args[] = { &help,
#if CONFIG_FILEOPTIONS
&use_cfg,
@@ -222,7 +225,6 @@ static const arg_def_t *main_args[] = { &help,
&use_webm,
&use_ivf,
&use_obu,
- &out_part,
&q_hist_n,
&rate_hist_n,
&disable_warnings,
@@ -411,16 +413,13 @@ static const arg_def_t max_intra_rate_pct =
#if CONFIG_AV1_ENCODER
static const arg_def_t cpu_used_av1 =
ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
-static const arg_def_t dev_sf_av1 =
- ARG_DEF(NULL, "dev-sf", 1, "Dev Speed (0..255)");
-static const arg_def_t single_tile_decoding =
- ARG_DEF(NULL, "single-tile-decoding", 1,
- "Single tile decoding (0: off (default), 1: on)");
+static const arg_def_t rowmtarg =
+ ARG_DEF(NULL, "row-mt", 1,
+ "Enable row based multi-threading (0: off (default), 1: on)");
static const arg_def_t tile_cols =
ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
static const arg_def_t tile_rows =
- ARG_DEF(NULL, "tile-rows", 1,
- "Number of tile rows to use, log2 (set to 0 while threads > 1)");
+ ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
static const arg_def_t tile_width =
ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
static const arg_def_t tile_height =
@@ -619,11 +618,10 @@ static const arg_def_t superblock_size = ARG_DEF_ENUM(
NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
static const arg_def_t *av1_args[] = { &cpu_used_av1,
- &dev_sf_av1,
&auto_altref,
&sharpness,
&static_thresh,
- &single_tile_decoding,
+ &rowmtarg,
&tile_cols,
&tile_rows,
&arnr_maxframes,
@@ -670,16 +668,17 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
&enable_ref_frame_mvs,
&bitdeptharg,
&inbitdeptharg,
+ &input_chroma_subsampling_x,
+ &input_chroma_subsampling_y,
&sframe_dist,
&sframe_mode,
&save_as_annexb,
NULL };
static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
- AOME_SET_DEVSF,
AOME_SET_ENABLEAUTOALTREF,
AOME_SET_SHARPNESS,
AOME_SET_STATIC_THRESHOLD,
- AV1E_SET_SINGLE_TILE_DECODING,
+ AV1E_SET_ROW_MT,
AV1E_SET_TILE_COLUMNS,
AV1E_SET_TILE_ROWS,
AOME_SET_ARNR_MAXFRAMES,
@@ -830,6 +829,8 @@ struct stream_state {
struct aom_image *img;
aom_codec_ctx_t decoder;
int mismatch_seen;
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
};
static void validate_positive_rational(const char *msg,
@@ -927,9 +928,7 @@ static void parse_global_config(struct AvxEncoderConfig *global, int *argc,
global->framerate = arg_parse_rational(&arg);
validate_positive_rational(arg.name, &global->framerate);
global->have_framerate = 1;
- } else if (arg_match(&arg, &out_part, argi))
- global->out_part = 1;
- else if (arg_match(&arg, &debugmode, argi))
+ } else if (arg_match(&arg, &debugmode, argi))
global->debug = 1;
else if (arg_match(&arg, &q_hist_n, argi))
global->show_q_hist_buckets = arg_parse_uint(&arg);
@@ -1087,6 +1086,11 @@ static void set_config_arg_ctrls(struct stream_config *config, int key,
assert(j < (int)ARG_CTRL_CNT_MAX);
config->arg_ctrls[j][0] = key;
config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+
+ if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) {
+ warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+ config->arg_ctrls[j][1] = 1;
+ }
if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
}
@@ -1174,6 +1178,10 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
} else if (arg_match(&arg, &inbitdeptharg, argi)) {
config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &input_chroma_subsampling_x, argi)) {
+ stream->chroma_subsampling_x = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &input_chroma_subsampling_y, argi)) {
+ stream->chroma_subsampling_y = arg_parse_uint(&arg);
#if CONFIG_WEBM_IO
} else if (arg_match(&arg, &stereo_mode, argi)) {
config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -1531,7 +1539,6 @@ static void initialize_encoder(struct stream_state *stream,
int flags = 0;
flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
- flags |= global->out_part ? AOM_CODEC_USE_OUTPUT_PARTITION : 0;
flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
/* Construct Encoder Context */
@@ -1609,14 +1616,14 @@ static void encode_frame(struct stream_state *stream,
aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16);
}
I420Scale_16(
- (uint16 *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2,
- (uint16 *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2,
- (uint16 *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2,
- img->d_w, img->d_h, (uint16 *)stream->img->planes[AOM_PLANE_Y],
+ (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2,
+ (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2,
+ (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2,
+ img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y],
stream->img->stride[AOM_PLANE_Y] / 2,
- (uint16 *)stream->img->planes[AOM_PLANE_U],
+ (uint16_t *)stream->img->planes[AOM_PLANE_U],
stream->img->stride[AOM_PLANE_U] / 2,
- (uint16 *)stream->img->planes[AOM_PLANE_V],
+ (uint16_t *)stream->img->planes[AOM_PLANE_V],
stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w,
stream->img->d_h, kFilterBox);
img = stream->img;
@@ -2043,17 +2050,25 @@ int main(int argc, const char **argv_) {
input.fmt == AOM_IMG_FMT_I42016)) {
stream->config.cfg.g_profile = 0;
profile_updated = 1;
+ } else if (input.bit_depth == 12 &&
+ input.file_type == FILE_TYPE_Y4M) {
+ // Note that here the input file values for chroma subsampling
+ // are used instead of those from the command line.
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
+ input.y4m.dst_c_dec_h >> 1);
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
+ input.y4m.dst_c_dec_v >> 1);
+ } else if (input.bit_depth == 12 &&
+ input.file_type == FILE_TYPE_RAW) {
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
+ stream->chroma_subsampling_x);
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
+ stream->chroma_subsampling_y);
}
break;
default: break;
}
}
- /* Automatically set the codec bit depth to match the input bit depth.
- * Upgrade the profile if required. */
- if (stream->config.cfg.g_input_bit_depth >
- (unsigned int)stream->config.cfg.g_bit_depth) {
- stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
- }
if (stream->config.cfg.g_bit_depth > 10) {
switch (stream->config.cfg.g_profile) {
case 0:
diff --git a/third_party/aom/apps/aomenc.h b/third_party/aom/apps/aomenc.h
index 976079d74..7c23df006 100644
--- a/third_party/aom/apps/aomenc.h
+++ b/third_party/aom/apps/aomenc.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOMENC_H_
-#define AOMENC_H_
+#ifndef AOM_APPS_AOMENC_H_
+#define AOM_APPS_AOMENC_H_
#include "aom/aom_encoder.h"
@@ -47,7 +47,6 @@ struct AvxEncoderConfig {
enum TestDecodeFatality test_decode;
int have_framerate;
struct aom_rational framerate;
- int out_part;
int debug;
int show_q_hist_buckets;
int show_rate_hist_buckets;
@@ -60,4 +59,4 @@ struct AvxEncoderConfig {
} // extern "C"
#endif
-#endif // AOMENC_H_
+#endif // AOM_APPS_AOMENC_H_
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 4c4f542fe..3a7cd7ee1 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -53,6 +53,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/mv.h"
"${AOM_ROOT}/av1/common/mvref_common.c"
"${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/obu_util.c"
+ "${AOM_ROOT}/av1/common/obu_util.h"
"${AOM_ROOT}/av1/common/odintrin.c"
"${AOM_ROOT}/av1/common/odintrin.h"
"${AOM_ROOT}/av1/common/onyxc_int.h"
@@ -78,8 +80,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/thread_common.h"
"${AOM_ROOT}/av1/common/tile_common.c"
"${AOM_ROOT}/av1/common/tile_common.h"
- "${AOM_ROOT}/av1/common/timing.h"
"${AOM_ROOT}/av1/common/timing.c"
+ "${AOM_ROOT}/av1/common/timing.h"
"${AOM_ROOT}/av1/common/token_cdfs.h"
"${AOM_ROOT}/av1/common/txb_common.c"
"${AOM_ROOT}/av1/common/txb_common.h"
@@ -176,6 +178,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/rd.h"
"${AOM_ROOT}/av1/encoder/rdopt.c"
"${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
"${AOM_ROOT}/av1/encoder/segmentation.c"
"${AOM_ROOT}/av1/encoder/segmentation.h"
"${AOM_ROOT}/av1/encoder/speed_features.c"
@@ -268,7 +272,8 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
- "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
@@ -276,7 +281,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@@ -301,6 +308,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
"${AOM_ROOT}/av1/common/cdef_block_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 3bc4804c9..3295f618a 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -14,28 +14,29 @@
#include "config/aom_config.h"
#include "config/aom_version.h"
-#include "aom/aom_encoder.h"
#include "aom_ports/aom_once.h"
+#include "aom_ports/mem_ops.h"
#include "aom_ports/system_state.h"
+
+#include "aom/aom_encoder.h"
#include "aom/internal/aom_codec_internal.h"
-#include "av1/encoder/encoder.h"
-#include "aom/aomcx.h"
-#include "av1/encoder/firstpass.h"
+
#include "av1/av1_iface_common.h"
#include "av1/encoder/bitstream.h"
-#include "aom_ports/mem_ops.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
#define MAG_SIZE (4)
#define MAX_NUM_ENHANCEMENT_LAYERS 3
struct av1_extracfg {
int cpu_used; // available cpu percentage in 1/16
- int dev_sf;
unsigned int enable_auto_alt_ref;
unsigned int enable_auto_bwd_ref;
unsigned int noise_sensitivity;
unsigned int sharpness;
unsigned int static_thresh;
+ unsigned int row_mt;
unsigned int tile_columns; // log2 number of tile columns
unsigned int tile_rows; // log2 number of tile rows
unsigned int arnr_max_frames;
@@ -98,37 +99,40 @@ struct av1_extracfg {
float noise_level;
int noise_block_size;
#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
};
static struct av1_extracfg default_extra_cfg = {
- 0, // cpu_used
- 0, // dev_sf
- 1, // enable_auto_alt_ref
- 0, // enable_auto_bwd_ref
- 0, // noise_sensitivity
- 0, // sharpness
- 0, // static_thresh
- 0, // tile_columns
- 0, // tile_rows
- 7, // arnr_max_frames
- 5, // arnr_strength
- 0, // min_gf_interval; 0 -> default decision
- 0, // max_gf_interval; 0 -> default decision
- AOM_TUNE_PSNR, // tuning
- 10, // cq_level
- 0, // rc_max_intra_bitrate_pct
- 0, // rc_max_inter_bitrate_pct
- 0, // gf_cbr_boost_pct
- 0, // lossless
- 1, // enable_cdef
- 1, // enable_restoration
- 0, // disable_trellis_quant
- 0, // enable_qm
- DEFAULT_QM_Y, // qm_y
- DEFAULT_QM_U, // qm_u
- DEFAULT_QM_V, // qm_v
- DEFAULT_QM_FIRST, // qm_min
- DEFAULT_QM_LAST, // qm_max
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // enable_auto_bwd_ref
+ 0, // noise_sensitivity
+ CONFIG_SHARP_SETTINGS, // sharpness
+ 0, // static_thresh
+ 0, // row_mt
+ 0, // tile_columns
+ 0, // tile_rows
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ AOM_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+ !CONFIG_SHARP_SETTINGS, // enable_cdef
+ 1, // enable_restoration
+ 0, // disable_trellis_quant
+ 0, // enable_qm
+ DEFAULT_QM_Y, // qm_y
+ DEFAULT_QM_U, // qm_u
+ DEFAULT_QM_V, // qm_v
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
#if CONFIG_DIST_8X8
0,
#endif
@@ -150,7 +154,7 @@ static struct av1_extracfg default_extra_cfg = {
0, // render width
0, // render height
AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
- 0, // Single tile decoding is off by default.
+ 1, // single_tile_decoding defaults on; only takes effect with large_scale_tile.
0, // error_resilient_mode off by default.
0, // s_frame_mode off by default.
0, // film_grain_test_vector
@@ -168,6 +172,8 @@ static struct av1_extracfg default_extra_cfg = {
0, // noise_level
32, // noise_block_size
#endif
+ 0, // chroma_subsampling_x
+ 0, // chroma_subsampling_y
};
struct aom_codec_alg_priv {
@@ -251,10 +257,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
if (extra_cfg->max_gf_interval > 0) {
- RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
- }
- if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
- RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+ RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
(MAX_LAG_BUFFERS - 1));
}
@@ -284,13 +287,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
- RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
AOM_SUPERBLOCK_SIZE_DYNAMIC);
RANGE_CHECK_HI(cfg, large_scale_tile, 1);
RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
+ RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+
RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
@@ -372,6 +376,9 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
#endif
}
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
return AOM_CODEC_OK;
}
@@ -581,7 +588,6 @@ static aom_codec_err_t set_encoder_config(
oxcf->sframe_mode = cfg->sframe_mode;
oxcf->sframe_enabled = cfg->sframe_dist != 0;
oxcf->speed = extra_cfg->cpu_used;
- oxcf->dev_sf = extra_cfg->dev_sf;
oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
@@ -637,6 +643,8 @@ static aom_codec_err_t set_encoder_config(
oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
}
+ oxcf->row_mt = extra_cfg->row_mt;
+
oxcf->tile_columns = extra_cfg->tile_columns;
oxcf->tile_rows = extra_cfg->tile_rows;
@@ -692,6 +700,24 @@ static aom_codec_err_t set_encoder_config(
oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+#if CONFIG_REDUCED_ENCODER_BORDER
+ if (oxcf->superres_mode != SUPERRES_NONE ||
+ oxcf->resize_mode != RESIZE_NONE) {
+ warn(
+ "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
+ "Disabling superres/resize.\n");
+ // return AOM_CODEC_INVALID_PARAM;
+ disable_superres(oxcf);
+ oxcf->resize_mode = RESIZE_NONE;
+ oxcf->resize_scale_denominator = SCALE_NUMERATOR;
+ oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
+ }
+#endif // CONFIG_REDUCED_ENCODER_BORDER
+
+ oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+ oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+
return AOM_CODEC_OK;
}
@@ -731,6 +757,10 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
return res;
}
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+ return av1_get_global_headers(ctx->cpi);
+}
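+
+// With this hook wired up, callers can retrieve the sequence header OBU via
+// the public aom_codec_get_global_headers() API (returns an aom_fixed_buf_t *).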
+
static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
va_list args) {
int *const arg = va_arg(args, int *);
@@ -765,12 +795,6 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
-static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) {
- struct av1_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args);
- return update_extra_cfg(ctx, &extra_cfg);
-}
-
static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -806,6 +830,13 @@ static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
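+
+// Typical application usage, mirroring the aom_codec_control() calls made
+// elsewhere in this patch: aom_codec_control(&ctx, AV1E_SET_ROW_MT, 1);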
+
static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1669,6 +1700,20 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
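+
+// These pair with the encoder-side calls added to aomenc, e.g.
+//   aom_codec_control(&enc, AV1E_SET_CHROMA_SUBSAMPLING_X, 1);
+//   aom_codec_control(&enc, AV1E_SET_CHROMA_SUBSAMPLING_Y, 1);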
+
static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1_COPY_REFERENCE, ctrl_copy_reference },
{ AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1681,11 +1726,11 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AOME_SET_SCALEMODE, ctrl_set_scale_mode },
{ AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
{ AOME_SET_CPUUSED, ctrl_set_cpuused },
- { AOME_SET_DEVSF, ctrl_set_devsf },
{ AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
{ AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
{ AOME_SET_SHARPNESS, ctrl_set_sharpness },
{ AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_ROW_MT, ctrl_set_row_mt },
{ AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
{ AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
{ AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
@@ -1754,7 +1799,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
{ AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
{ AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
-
+ { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+ { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
{ -1, NULL },
};
@@ -1850,13 +1896,13 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
},
{
// NOLINT
- 1, // 1 cfg map
- encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
- encoder_encode, // aom_codec_encode_fn_t
- encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
- encoder_set_config, // aom_codec_enc_config_set_fn_t
- NULL, // aom_codec_get_global_headers_fn_t
- encoder_get_preview, // aom_codec_get_preview_frame_fn_t
- NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ 1, // 1 cfg map
+ encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ encoder_get_global_headers, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index f42572019..4a6631047 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -26,6 +26,7 @@
#include "av1/common/alloccommon.h"
#include "av1/common/frame_buffers.h"
#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
#include "av1/decoder/decoder.h"
#include "av1/decoder/decodeframe.h"
@@ -46,6 +47,7 @@ struct aom_codec_alg_priv {
int last_show_frame; // Index of last output frame.
int byte_alignment;
int skip_loop_filter;
+ int skip_film_grain;
int decode_tile_row;
int decode_tile_col;
unsigned int tile_mode;
@@ -103,6 +105,15 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
priv->cfg.cfg.ext_partition = 1;
}
av1_zero(priv->image_with_grain);
+ // Turn row_mt on by default.
+ priv->row_mt = 1;
+
+ // Turn on normal tile coding mode by default.
+ // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+ // mode (refer to the lightfield example).
+ priv->tile_mode = 0;
+ priv->decode_tile_row = -1;
+ priv->decode_tile_col = -1;
}
return AOM_CODEC_OK;
@@ -216,7 +227,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
while (1) {
data += bytes_read;
data_sz -= bytes_read;
- const uint8_t *payload_start = data;
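+ // Bounds check: the advertised OBU payload must fit in the remaining data.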
+ if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
// Check that the selected OBU is a sequence header
if (obu_header.type == OBU_SEQUENCE_HEADER) {
// Sanity check on sequence header size
@@ -264,9 +275,9 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
}
}
// skip past any unread OBU header data
- data = payload_start + payload_size;
+ data += payload_size;
data_sz -= payload_size;
- if (data_sz <= 0) break; // exit if we're out of OBUs
+ if (data_sz == 0) break; // exit if we're out of OBUs
status = aom_read_obu_header_and_size(
data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
if (status != AOM_CODEC_OK) return status;
@@ -313,6 +324,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
cm->new_fb_idx = INVALID_IDX;
cm->byte_alignment = ctx->byte_alignment;
cm->skip_loop_filter = ctx->skip_loop_filter;
+ cm->skip_film_grain = ctx->skip_film_grain;
if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
pool->get_fb_cb = ctx->get_ext_fb_cb;
@@ -434,7 +446,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
frame_worker_data->pbi->row_mt = ctx->row_mt;
- worker->hook = (AVxWorkerHook)frame_worker_hook;
+ worker->hook = frame_worker_hook;
if (!winterface->reset(worker)) {
set_error_detail(ctx, "Frame Worker thread creation failed");
return AOM_CODEC_MEM_ERROR;
@@ -515,12 +527,11 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
const uint8_t *data, size_t data_sz,
void *user_priv) {
- const uint8_t *data_start = data;
- const uint8_t *data_end = data + data_sz;
aom_codec_err_t res = AOM_CODEC_OK;
- // Release any pending output frames from the previous decoder call.
- // We need to do this even if the decoder is being flushed
+ // Release any pending output frames from the previous decoder_decode call.
+ // We need to do this even if the decoder is being flushed or the input
+ // arguments are invalid.
if (ctx->frame_workers) {
BufferPool *const pool = ctx->buffer_pool;
RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -538,10 +549,13 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
unlock_buffer_pool(ctx->buffer_pool);
}
+ /* Sanity checks */
+ /* A NULL data pointer is allowed only when data_sz is also 0 */
if (data == NULL && data_sz == 0) {
ctx->flushed = 1;
return AOM_CODEC_OK;
}
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
@@ -552,6 +566,9 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
if (res != AOM_CODEC_OK) return res;
}
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
if (ctx->is_annexb) {
// read the size of this temporal unit
size_t length_of_size;
@@ -617,6 +634,7 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
img->fmt != grain_img_buf->fmt) {
aom_img_free(grain_img_buf);
grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
}
}
if (!grain_img_buf) {
@@ -624,7 +642,14 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
*grain_img_ptr = grain_img_buf;
}
- av1_add_film_grain(grain_params, img, grain_img_buf);
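+ // On failure, free the scratch grain image and clear the cached pointer so
+ // the caller sees a NULL return rather than a stale buffer.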
+ if (grain_img_buf) {
+ grain_img_buf->user_priv = img->user_priv;
+ if (av1_add_film_grain(grain_params, img, grain_img_buf)) {
+ aom_img_free(grain_img_buf);
+ grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
+ }
+ }
return grain_img_buf;
}
@@ -720,8 +745,13 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
img = &ctx->img;
img->temporal_id = cm->temporal_layer_id;
img->spatial_id = cm->spatial_layer_id;
+ if (cm->skip_film_grain) grain_params->apply_grain = 0;
aom_image_t *res = add_grain_if_needed(
img, &ctx->image_with_grain[*index], grain_params);
+ if (!res) {
+ aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+ "Grain systhesis failed\n");
+ }
*index += 1; // Advance the iterator to point to the next image
return res;
}
@@ -1128,6 +1158,19 @@ static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->skip_film_grain = va_arg(args, int);
+
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.skip_film_grain = ctx->skip_film_grain;
+ }
+
+ return AOM_CODEC_OK;
+}
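+
+// Decoder-side control; applications can skip grain synthesis entirely via
+// aom_codec_control(&dec, AV1D_SET_SKIP_FILM_GRAIN, 1);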
+
static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
va_list args) {
#if !CONFIG_ACCOUNTING
@@ -1231,6 +1274,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
{ AV1D_SET_ROW_MT, ctrl_set_row_mt },
{ AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+ { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
// Getters
{ AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
index c03892b73..4a7af580b 100644
--- a/third_party/aom/av1/av1_iface_common.h
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -8,10 +8,11 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_AV1_IFACE_COMMON_H_
-#define AV1_AV1_IFACE_COMMON_H_
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
void *user_priv) {
@@ -132,4 +133,4 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
return AOM_CODEC_OK;
}
-#endif // AV1_AV1_IFACE_COMMON_H_
+#endif // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
index dbcb5b947..8e5896981 100644
--- a/third_party/aom/av1/common/alloccommon.h
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ALLOCCOMMON_H_
-#define AV1_COMMON_ALLOCCOMMON_H_
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
#define INVALID_IDX -1 // Invalid buffer index.
@@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height);
} // extern "C"
#endif
-#endif // AV1_COMMON_ALLOCCOMMON_H_
+#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
index 51c991498..bad411743 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <arm_neon.h>
+
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
@@ -19,19 +21,7 @@
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#include "av1/common/arm/av1_inv_txfm_neon.h"
-
-static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
- const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
- TxSetType tx_set_type;
- if (tx_size_sqr_up > TX_32X32) {
- tx_set_type = EXT_TX_SET_DCTONLY;
- } else if (tx_size_sqr_up == TX_32X32) {
- tx_set_type = EXT_TX_SET_DCT_IDTX;
- } else {
- tx_set_type = EXT_TX_SET_ALL16;
- }
- return tx_set_type;
-}
+#include "av1/common/arm/transpose_neon.h"
// 1D itx types
typedef enum ATTRIBUTE_PACKED {
@@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
{ av1_idct64_new, NULL, NULL },
};
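+
+// Adds the 8-wide residual rows in |in| to the pixels at |output|, optionally
+// reading the residual rows in reverse (flipud), saturating the sums to uint8.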
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ int16x8_t temp_output;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+ temp_output = vaddq_s16(temp_output, in[j]);
+ vst1_u8(output, vqmovun_s16(temp_output));
+ output += stride;
+ }
+}
+
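+// Widens a 16-pixel predictor vector, adds two 8-lane residual vectors, and
+// repacks the result with unsigned saturation.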
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+ int16x8_t res0,
+ int16x8_t res1) {
+ int16x8_t temp_output[2];
+ uint8x16_t temp_output_8q;
+ temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+ temp_output[0] = vaddq_s16(temp_output[0], res0);
+ temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+ temp_output[1] = vaddq_s16(temp_output[1], res1);
+ temp_output_8q =
+ vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+ return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud, int height) {
+ uint8x16_t temp_output_8q;
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output_8q = vld1q_u8(output + i * stride);
+ temp_output_8q =
+ lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
+ vst1q_u8((output + i * stride), temp_output_8q);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+ int value) {
+ for (int i = 0; i < size; i++) {
+ a[i] = vdupq_n_s16((int16_t)value);
+ }
+}
+
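+// Butterfly using lanes 0/1 of |c| as coefficients:
+//   *t0 = ROUND_POWER_OF_TWO(in0 * c[0] + in1 * c[1], INV_COS_BIT)
+//   *t1 = ROUND_POWER_OF_TWO(in0 * c[1] - in1 * c[0], INV_COS_BIT)
+// The btf_16_lane_{1_0,2_3,3_2}_neon variants below differ only in which
+// coefficient lanes they select.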
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
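+// Single-input butterfly: *t0 = ROUND_POWER_OF_TWO(in0 * coef1, INV_COS_BIT)
+// and *t1 = ROUND_POWER_OF_TWO(in0 * coef2, INV_COS_BIT).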
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+ int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0_l, s0_h, s1_l, s1_h;
+ int16x4_t v0[2], v1[2];
+
+ s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
+ s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
+ s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
+ s1_h = vmull_n_s16(vget_high_s16(in0), coef2);
+
+ v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
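+// Sum/difference butterfly scaled by c[0]:
+//   x[0] = ROUND_POWER_OF_TWO((x[0] + x[1]) * c[0], INV_COS_BIT)
+//   x[1] = ROUND_POWER_OF_TWO((x[0] - x[1]) * c[0], INV_COS_BIT)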
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+ int32x4_t t0[2], t1[2];
+ int16x4_t v0[2], v1[2];
+
+ // Don't add/sub before the multiply; doing so would overflow in iadst8.
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+ const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+ const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+ const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+ t0[0] = vaddq_s32(x0_lo, x1_lo);
+ t0[1] = vaddq_s32(x0_hi, x1_hi);
+ t1[0] = vsubq_s32(x0_lo, x1_lo);
+ t1[1] = vsubq_s32(x0_hi, x1_hi);
+
+ v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);
+
+ x[0] = vcombine_s16(v0[0], v0[1]);
+ x[1] = vcombine_s16(v1[0], v1[1]);
+}
+
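+// Gathers four 16-bit constants (the low halves of 32-bit cospi values) into
+// a single int16x4_t, for use as multiplier lanes by the btf_16_lane_*
+// helpers above.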
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+ int16_t *const c2,
+ int16_t *const c3) {
+ int16x4_t val = vdup_n_s16((int16_t)0);
+ val = vld1_lane_s16(c0, val, 0);
+ val = vld1_lane_s16(c1, val, 1);
+ val = vld1_lane_s16(c2, val, 2);
+ val = vld1_lane_s16(c3, val, 3);
+ return val;
+}
+
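+// 8-point inverse ADST. Stage 1 applies the input permutation required by the
+// AV1 flow graph; the remaining stages mirror the scalar reference transform.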
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // Stage 1
+ x[0] = in[7];
+ x[1] = in[0];
+ x[2] = in[5];
+ x[3] = in[2];
+ x[4] = in[3];
+ x[5] = in[4];
+ x[6] = in[1];
+ x[7] = in[6];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s4);
+ x[1] = vqaddq_s16(s1, s5);
+ x[2] = vqaddq_s16(s2, s6);
+ x[3] = vqaddq_s16(s3, s7);
+ x[4] = vqsubq_s16(s0, s4);
+ x[5] = vqsubq_s16(s1, s5);
+ x[6] = vqsubq_s16(s2, s6);
+ x[7] = vqsubq_s16(s3, s7);
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ s2 = x[2];
+ s3 = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+ // Stage 5
+ x[0] = vqaddq_s16(s0, s2);
+ x[1] = vqaddq_s16(s1, s3);
+ x[2] = vqsubq_s16(s0, s2);
+ x[3] = vqsubq_s16(s1, s3);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
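+// The *_low1 variants assume only the first coefficient, in[0], is nonzero,
+// so most butterfly stages collapse into copies and single-input scalings.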
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s4, s5;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[4] = s0;
+ x[5] = s1;
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+ // Stage 5
+ x[0] = s0;
+ x[1] = s1;
+ x[2] = s0;
+ x[3] = s1;
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[8], step2[8];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+ btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+ // stage 3
+ btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+ // stage 4
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+ // stage 5
+ out[0] = vqaddq_s16(step1[0], step2[7]);
+ out[1] = vqaddq_s16(step1[1], step1[6]);
+ out[2] = vqaddq_s16(step1[2], step1[5]);
+ out[3] = vqaddq_s16(step1[3], step2[4]);
+ out[4] = vqsubq_s16(step1[3], step2[4]);
+ out[5] = vqsubq_s16(step1[2], step1[5]);
+ out[6] = vqsubq_s16(step1[1], step1[6]);
+ out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
+
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 4
+ // stage 5
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+}
+
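+// Rounding right-shift of |size| vectors by |bit|; vrshlq_s16 with a negative
+// shift count performs a rounded right shift.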
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+ assert(!(size % 4));
+ if (!bit) return;
+ const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
+ for (int i = 0; i < size; i++) {
+ arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
+ }
+}
+
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+ int16x8_t temp[8];
+ for (int i = 0; i < size; ++i) {
+ temp[i] = input[size - 1 - i];
+ }
+ for (int i = 0; i < size; ++i) {
+ input[i] = temp[i];
+ }
+}
+
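+// Narrows eight rows of 32-bit coefficients to 16 bits, advancing the source
+// pointer by |out_size| coefficients per row.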
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+ int16x8_t *const a,
+ int out_size) {
+ for (int i = 0; i < 8; ++i) {
+ a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
+ vmovn_s32(vld1q_s32(input + 4)));
+ input += out_size;
+ }
+}
+
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ output[0] = vmulq_n_s16(input[0], (int16_t)2);
+ output[1] = vmulq_n_s16(input[1], (int16_t)2);
+ output[2] = vmulq_n_s16(input[2], (int16_t)2);
+ output[3] = vmulq_n_s16(input[3], (int16_t)2);
+ output[4] = vmulq_n_s16(input[4], (int16_t)2);
+ output[5] = vmulq_n_s16(input[5], (int16_t)2);
+ output[6] = vmulq_n_s16(input[6], (int16_t)2);
+ output[7] = vmulq_n_s16(input[7], (int16_t)2);
+}
+
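+// Rectangular-transform scaling: multiplies each vector by 1/sqrt(2)
+// (NewInvSqrt2 in Q(NewSqrt2Bits) fixed point) with rounding.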
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+ int size) {
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+
+ for (int z = 0; z < size; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
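+// Identity transform for 16-point dimensions scales by 2 * sqrt(2); NewSqrt2
+// is sqrt(2) in Q(NewSqrt2Bits) fixed point, hence the rounding shift.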
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+ int16_t scale = (int16_t)(2 * NewSqrt2);
+
+ for (int z = 0; z < 16; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), scale);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ for (int z = 0; z < 32; ++z) {
+ output[z] = vmulq_n_s16(input[z], (int16_t)4);
+ }
+}
+
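+// DC-only 16-point IDCT: with just in[0] nonzero, the whole network reduces
+// to a single multiply by cospi[32] (i.e. 1/sqrt(2)) broadcast to all 16
+// outputs.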
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 4
+
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+}
+
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+ btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+ btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+ step2[0] = in[0];
+ step2[1] = in[8];
+ step2[2] = in[4];
+ step2[3] = in[12];
+ step2[4] = in[2];
+ step2[5] = in[10];
+ step2[6] = in[6];
+ step2[7] = in[14];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
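+// Assumes coefficients at indices >= 8 are zero, so the early butterfly
+// stages reduce to single-input btf_16_neon scalings.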
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[2] = in[4];
+ step2[4] = in[2];
+ step2[6] = in[6];
+
+ btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+ // stage 3
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[0] = in[15];
+ x[1] = in[0];
+ x[2] = in[13];
+ x[3] = in[2];
+ x[4] = in[11];
+ x[5] = in[4];
+ x[6] = in[9];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[9] = in[8];
+ x[10] = in[5];
+ x[11] = in[10];
+ x[12] = in[3];
+ x[13] = in[12];
+ x[14] = in[1];
+ x[15] = in[14];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+ btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+ btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+ btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[10];
+ int16x8_t s0, s1, s4, s5;
+ int16x8_t s8, s9, s12, s13;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[8] = s0;
+ x[9] = s1;
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+ // Stage 5
+ x[0] = t[0];
+ x[1] = t[1];
+ x[4] = t[0];
+ x[5] = t[1];
+ x[8] = s8;
+ x[9] = s9;
+ x[12] = s8;
+ x[13] = s9;
+
+ // stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ t[8] = x[8];
+ t[9] = x[9];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+ // Stage 7
+ x[0] = t[0];
+ x[1] = t[1];
+ x[2] = t[0];
+ x[3] = t[1];
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+ x[8] = t[8];
+ x[9] = t[9];
+ x[10] = t[8];
+ x[11] = t[9];
+ x[12] = s12;
+ x[13] = s13;
+ x[14] = s12;
+ x[15] = s13;
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[1] = in[0];
+ x[3] = in[2];
+ x[5] = in[4];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[10] = in[5];
+ x[12] = in[3];
+ x[14] = in[1];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+ btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+ btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+ btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+ btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+ btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+ btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+ btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c5 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c6 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c7 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+ btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+ btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+ btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+ btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+ btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+ step2[0] = in[0];
+ step2[1] = in[16];
+ step2[2] = in[8];
+ step2[3] = in[24];
+ step2[4] = in[4];
+ step2[5] = in[20];
+ step2[6] = in[12];
+ step2[7] = in[28];
+ step2[8] = in[2];
+ step2[9] = in[18];
+ step2[10] = in[10];
+ step2[11] = in[26];
+ step2[12] = in[6];
+ step2[13] = in[22];
+ step2[14] = in[14];
+ step2[15] = in[30];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+ btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+ btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+ btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[4] = step2[4];
+ step1[5] = step2[5];
+ step1[6] = step2[6];
+ step1[7] = step2[7];
+
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+ btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+ btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[1], step1[2]);
+ step2[2] = vqsubq_s16(step1[1], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+
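+  // With only the DC coefficient non-zero, stages 1-5 reduce to scaling
+  // in[0] by cospi[32]; stages 6-9 then amount to broadcasting that value
+  // to all 32 outputs.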
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+ out[16] = step1;
+ out[17] = step1;
+ out[18] = step1;
+ out[19] = step1;
+ out[20] = step1;
+ out[21] = step1;
+ out[22] = step1;
+ out[23] = step1;
+ out[24] = step1;
+ out[25] = step1;
+ out[26] = step1;
+ out[27] = step1;
+ out[28] = step1;
+ out[29] = step1;
+ out[30] = step1;
+ out[31] = step1;
+}
+
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
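+  // The eob lies in the top-left 8x8, so at most the first eight inputs can
+  // be non-zero; each stage-2 butterfly has one known-zero input and
+  // collapses to the single-input btf_16_neon form.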
+ step2[0] = in[0];
+ step2[4] = in[4];
+ step2[8] = in[2];
+ step2[12] = in[6];
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[16];
+ step1[18] = step2[19];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[21] = step2[20];
+ step1[22] = step2[23];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[24];
+ step1[26] = step2[27];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[29] = step2[28];
+ step1[30] = step2[31];
+ step1[31] = step2[31];
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[8];
+ step2[10] = step1[11];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[13] = step1[12];
+ step2[14] = step1[15];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = step2[4];
+ step1[5] = step2[4];
+ step1[6] = step2[7];
+ step1[7] = step2[7];
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[0];
+ step2[2] = step1[0];
+ step2[3] = step1[0];
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
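+  // eob within the top-left 16x16: with sixteen possibly non-zero inputs,
+  // every stage-2 butterfly can still drop its zero-valued partner.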
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+ btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+ btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ step2[0] = in[0];
+ step2[2] = in[8];
+ step2[4] = in[4];
+ step2[6] = in[12];
+ step2[8] = in[2];
+ step2[10] = in[10];
+ step2[12] = in[6];
+ step2[14] = in[14];
+
+ // stage 3
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+ btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[0], step1[2]);
+ step2[2] = vqsubq_s16(step1[0], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
// Functions for blocks with eob at DC and within
// the top-left 8x8, 16x16, 32x32 corner
static const transform_1d_neon
@@ -90,10 +2112,37 @@ static const transform_1d_neon
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
-static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
- uint8_t *output, int stride,
- TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
+
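+// Indexed as [tx size][1-D tx type][variant]; the last index selects the
+// low1/low8/low16/full kernel for the eob position via
+// lowbd_txfm_all_1d_zeros_idx.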
+static const transform_neon
+ lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
+ { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
+ { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+ {
+ { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
+ { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
+ NULL },
+ { identity16_new_neon, identity16_new_neon, identity16_new_neon,
+ NULL },
+ },
+ { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
+ idct32_new_neon },
+ { NULL, NULL, NULL, NULL },
+ { identity32_new_neon, identity32_new_neon, identity32_new_neon,
+ identity32_new_neon } },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
int32_t *temp_in = txfm_buf;
@@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
}
}
-static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[32 * 4];
+ int16x8_t b[32 * 4];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
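+  // Row pass: narrow the 32-bit coefficients to 16 bits, transpose each 8x8
+  // tile so rows lie along vectors, run the row transform, then transpose
+  // into b[] for the column pass.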
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
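+    // 2:1 rectangular transforms carry an extra 1/sqrt(2) scale in AV1;
+    // round_shift_for_rect applies it here.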
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
}
}
-static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
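+    // For left/right-flipped tx types, reverse each 8-wide tile and store
+    // the tiles in mirrored order so the column pass reads flipped rows.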
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
}
}
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
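+  // Same two-pass flow as the idtx path; ud_flip only changes whether the
+  // final add writes the reconstructed rows bottom-up.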
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type,
@@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
}
}
-static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
@@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
}
}
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[64 * 8];
+ int16x8_t b[64 * 8];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
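+  // Both passes run real transforms here; only the 8-aligned region covering
+  // the eob is loaded, since coefficients past the eob are zero.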
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
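+  // IDTX is identity in both directions; H_* and V_* types pair a real 1-D
+  // transform in one direction with identity in the other; everything else
+  // runs the general two-pass path.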
+ switch (tx_type) {
+ case IDTX:
+ lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ default:
+ lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
static INLINE void lowbd_inv_txfm2d_add_universe_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
@@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon(
break;
}
}
+
void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, TX_SIZE tx_size,
int eob) {
@@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
break;
case TX_16X64: {
- lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X16: {
@@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_32X64: {
- lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X32: {
@@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X64: {
@@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
default:
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
index 6af2d61e7..9ec658291 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
@@ -23,6 +23,8 @@
typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
const int8_t cos_bit,
const int8_t *stage_ptr);
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit);
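+// 1-D row/column kernel operating on int16x8_t registers; cos_bit selects
+// the cospi table and the trailing bit argument is unused by these kernels.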
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
@@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
*eoby = eob_fill[temp_eoby];
}
-#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
index 0d8233744..7134f183e 100644
--- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
uint8x8_t tmp0, tmp1;
uint8x16_t res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32, tmp1_32;
- uint16x4_t tmp0_16, tmp1_16;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
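+  // Zero-initialized up front, presumably so the narrow-width paths never
+  // read uninitialized lanes.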
const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
if (w >= 16) {
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
index 33b06b767..194e94c8c 100644
--- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
uint8x8_t tmp0, tmp1;
uint8x16_t tmp0_q, tmp1_q, res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32, tmp1_32;
- uint16x4_t tmp0_16, tmp1_16;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
index d731b6a66..39025b5e5 100644
--- a/third_party/aom/av1/common/arm/cfl_neon.c
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
-#if __ARM_ARCH <= 7
+#ifndef __aarch64__
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
@@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
// Permute and add in such a way that each lane contains the block sum.
// [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
-#if __ARM_ARCH >= 8
+#ifdef __aarch64__
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
#else
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index f15744c94..d0c4f8ff6 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -13,6 +13,8 @@
#include <assert.h>
#include <arm_neon.h>
+#include "config/av1_rtcd.h"
+
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
@@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8(
return vqmovun_s16(sum);
}
+#if !defined(__aarch64__)
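+// Helper for the armv7 single-row path below, producing one 4-pixel output
+// row per call; aarch64 builds use the 8x8 transpose kernels instead.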
+static INLINE uint8x8_t convolve8_horiz_4x1(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+ int16x4_t sum;
+
+ sum = vmul_n_s16(s0, filter[0]);
+ sum = vmla_n_s16(sum, s1, filter[1]);
+ sum = vmla_n_s16(sum, s2, filter[2]);
+ sum = vmla_n_s16(sum, s5, filter[5]);
+ sum = vmla_n_s16(sum, s6, filter[6]);
+ sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a maximum value of 128, so 128 * 255 plus the
+   * accumulated sum can exceed 16 bits; accumulate these two taps with
+   * saturating adds.
+   */
+ sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+ sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+ sum = vqrshl_s16(sum, shift_round_0);
+ sum = vqrshl_s16(sum, shift_by_bits);
+
+ return vqmovun_s16(vcombine_s16(sum, sum));
+}
+#endif  // !defined(__aarch64__)
+
static INLINE uint8x8_t convolve8_vert_8x4(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
@@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
(void)conv_params;
(void)filter_params_y;
- uint8x8_t t0, t1, t2, t3;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+#endif
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
src -= horiz_offset;
-
+#if defined(__aarch64__)
if (h == 4) {
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
w -= 4;
} while (w > 0);
} else {
+#endif
int width;
const uint8_t *s;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+#endif
if (w <= 4) {
+#if defined(__aarch64__)
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
h -= 8;
} while (h > 0);
+#else
+ int16x8_t tt0;
+ int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+ const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+ const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
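+      // Load 16 source pixels and build the eight shifted 4-tap windows
+      // with vext, producing one output row per iteration.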
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ x4 = vget_high_s16(tt0); // a4 a5 a6 a7
+
+ t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x7 = vget_low_s16(tt0); // a8 a9 a10 a11
+
+ x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
+ x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
+ x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
+ x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
+ x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
+ x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
+
+ src += src_stride;
+
+ t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+ shift_round_0_low, shift_by_bits_low);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ }
+ h -= 1;
+ } while (h > 0);
+#endif
} else {
uint8_t *d;
- int16x8_t s11, s12, s13, s14;
-
+ int16x8_t s11;
+#if defined(__aarch64__)
+ int16x8_t s12, s13, s14;
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
+#else
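+      // armv7: slide an 8-wide window across each row with vextq, one
+      // output row per outer iteration.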
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ width = w;
+ s = src + 8;
+ d = dst;
+ __builtin_prefetch(dst);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s11 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ vst1_u8(d, t0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 1;
+ } while (h > 0);
+#endif
}
+#if defined(__aarch64__)
}
+#endif
}
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
filter_params_y, subpel_y_q4 & SUBPEL_MASK);
if (w <= 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t d01;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ uint8x8_t d23;
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
do {
s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
+#if defined(__aarch64__)
s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
h -= 4;
+#else
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+ dst += dst_stride;
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ h -= 1;
+#endif
} while (h > 0);
} else {
int height;
const uint8_t *s;
uint8_t *d;
- uint8x8_t t0, t1, t2, t3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
+ uint8x8_t t0;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+ int16x8_t s8, s9, s10;
+#endif
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
do {
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
+#if defined(__aarch64__)
s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
@@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ __builtin_prefetch(d);
+ __builtin_prefetch(s);
+
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
src += 8;
dst += 8;
@@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
ConvolveParams *conv_params) {
int im_dst_stride;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
@@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
assert(conv_params->round_0 > 0);
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
do {
s = src_ptr;
+
+#if defined(__aarch64__)
__builtin_prefetch(s + 0 * src_stride);
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
@@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
src_ptr += 4 * src_stride;
dst_ptr += 4 * im_dst_stride;
height -= 4;
+#else
+ int16x8_t tt0;
+
+ __builtin_prefetch(s);
+
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ if (w == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += im_dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += im_dst_stride;
+ }
+
+ src_ptr += src_stride;
+ height -= 1;
+#endif
} while (height > 0);
} else {
int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+#endif
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+#if defined(__aarch64__)
do {
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
@@ -886,6 +1100,45 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
dst_ptr += 8 * im_dst_stride;
height -= 8;
} while (height > 0);
+#else
+ do {
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t sum = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += im_dst_stride;
+ height -= 1;
+ } while (height > 0);
+#endif
}
// vertical
@@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
width = w;
if (width <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t dd0, dd1;
- uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
+#endif
d_u8 = dst_u8_ptr;
v_s = v_src_ptr;
@@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
v_s += (7 * im_stride);
do {
+#if defined(__aarch64__)
load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1008,11 +1269,48 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t res0, res1, res2, res3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
do {
__builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
height = h;
do {
+#if defined(__aarch64__)
load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
v_src_ptr += 8;
dst_u8_ptr += 8;
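
The vertical stage of the 2D path undoes the offset added by horiz_const: each column of the int16 im_block is accumulated in 32 bits, rounded by round_shift_vec, offset-corrected, and rounded again down to 8-bit pixels. The exact staging lives in the convolve8_vert_*_s32 helpers in convolve_neon.h; as an overall scalar model (a sketch under those naming assumptions):

    #include <stdint.h>

    /* One output pixel of the 2D vertical stage (sketch). round_1,
       offset_const, sub_const and round_bits mirror round_shift_vec,
       offset_const, sub_const_vec and vec_round_bits above. */
    static uint8_t convolve_2d_vert_sample_model(
        const int16_t *im, int im_stride, const int16_t filter[8], int round_1,
        int32_t offset_const, int32_t sub_const, int round_bits) {
      int32_t sum = offset_const;
      for (int k = 0; k < 8; ++k) sum += filter[k] * im[k * im_stride];
      sum = (sum + (1 << (round_1 - 1))) >> round_1;       /* round_shift_vec */
      sum -= sub_const;                                    /* sub_const_vec */
      sum = (sum + (1 << (round_bits - 1))) >> round_bits; /* vec_round_bits */
      return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum); /* vqmovun */
    }
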
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
index 47c93d645..f382984f2 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.h
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_
-#define AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#include <arm_neon.h>
@@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32(
return res;
}
-#endif // AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 4015082b4..e5674ef7c 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -22,12 +22,108 @@
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const_vec,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0;
+ uint16x4_t tmp_u0;
+ uint32x4_t sum0;
+ int32x4_t dst0;
+ int16x8_t tmp4;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+
+ sum0 = vmull_n_u16(res0, fwd_offset);
+ sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ } else {
+ const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+ tmp_u0 = vhadd_u16(res0, d0);
+
+ tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+ tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ }
+}
+
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0, tmp2;
+ int16x8_t f0;
+ uint32x4_t sum0, sum2;
+ int32x4_t dst0, dst2;
+
+ uint16x8_t tmp_u0;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+ const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+ sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+ sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+ sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp2 = vqmovn_s32(dst2);
+
+ f0 = vcombine_s16(tmp0, tmp2);
+
+ *t0 = vqmovun_s16(f0);
+
+ } else {
+ const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+ const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+ tmp_u0 = vhaddq_u16(res0, d0);
+
+ f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+ f0 = vqrshlq_s16(f0, round_bits_vec);
+
+ *t0 = vqmovun_s16(f0);
+ }
+}
+#endif  // !defined(__aarch64__)
+
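
These one-row helpers mirror the existing compute_avg_4x4/8x4 below: blend the stored first-pass sum with the current pass, remove the compound offset, and round down to a pixel. A scalar model (a sketch; the NEON code additionally saturates at the intermediate vqmovn narrowing, and DIST_PRECISION_BITS is assumed to be libaom's value of 4):

    #include <stdint.h>

    enum { DIST_PRECISION_BITS = 4 }; /* as in libaom's convolve.h */

    /* Scalar model of compute_avg_4x1/compute_avg_8x1 (sketch). */
    static uint8_t jnt_avg_model(uint16_t res, uint16_t d, uint16_t fwd_offset,
                                 uint16_t bck_offset, int16_t sub_const,
                                 int round_bits, int use_jnt_comp_avg) {
      int32_t v;
      if (use_jnt_comp_avg) {
        /* distance-weighted blend, then drop the weight precision */
        v = (int32_t)(((uint32_t)res * fwd_offset + (uint32_t)d * bck_offset) >>
                      DIST_PRECISION_BITS);
      } else {
        v = (int32_t)(((uint32_t)res + d) >> 1); /* vhadd_u16: halving add */
      }
      v -= sub_const;                                  /* remove compound offset */
      v = (v + (1 << (round_bits - 1))) >> round_bits; /* vqrshl by -round_bits */
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* vqmovun saturation */
    }
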
static INLINE void compute_avg_4x4(
uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const_vec, const int16_t round_bits,
- const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
int16x4_t tmp0, tmp1, tmp2, tmp3;
uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4(
uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const, const int16_t round_bits,
- const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
uint8x8_t *t3) {
int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
@@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon(
int16_t *dst_ptr;
int dst_stride;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
dst_ptr = im_block;
dst_stride = im_stride;
@@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon(
width = w;
if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint8x8_t t0;
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint8x8_t t1, t2, t3;
+#endif
do {
s = src;
__builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
@@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon(
src += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+      t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1_s16(dst_ptr, d0);
+
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
} while (height > 0);
} else {
int16_t *d_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint8x8_t t0;
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
-
do {
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
@@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon(
src += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
+#else
+ int16x8_t temp_0;
+ t0 = vld1_u8(src);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src + 8;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, horiz_const, shift_round_0);
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
} while (height > 0);
}
}
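
The horizontal pass of the jnt 2D path writes int16 intermediates rather than pixels, and the horiz_const term appears to exist to keep those intermediates non-negative so they survive the rounding shift into im_block. A scalar model of one intermediate sample, under the same naming as the code above (sketch; round_0 = conv_params->round_0, bd = 8 here):

    #include <stdint.h>

    enum { FILTER_BITS = 7 }; /* libaom's filter precision */

    /* One im_block sample of the jnt 2D horizontal pass, scalar model
       (sketch). */
    static int16_t jnt_2d_horiz_sample_model(const uint8_t *src,
                                             const int16_t filter[8],
                                             int round_0, int bd) {
      int32_t sum = 1 << (bd + FILTER_BITS - 2); /* horiz_const */
      for (int k = 0; k < 8; ++k) sum += filter[k] * src[k];
      /* shift_round_0 is vdup_n_s16(-round_0) in this kernel */
      return (int16_t)((sum + (1 << (round_0 - 1))) >> round_0);
    }
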
@@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon(
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t res4, res5, res6, res7;
- uint16x4_t d0, d1, d2, d3;
- uint8x8_t t0, t1;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t res4, d0;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t res5, res6, res7, d1, d2, d3;
+ uint8x8_t t1;
+#endif
dst = conv_params->dst;
src_ptr = im_block;
@@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
s += (7 * im_stride);
do {
+#if defined(__aarch64__)
load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
s += (im_stride << 2);
@@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon(
bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
&t0, &t1);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 1); // 30 31 32 33
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
d_u8 += dst8_stride;
} else {
@@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon(
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1_s16(s);
+ s += (im_stride);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1_u16(d, d0);
+ d += (dst_stride);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height--;
+#endif
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
uint8_t *dst_u8_ptr;
CONV_BUF_TYPE *d, *dst_ptr;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
s = src_ptr;
dst_ptr = dst;
dst_u8_ptr = dst8;
@@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
height = h;
if ((w == 4) || (h == 4)) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
- uint16x4_t res4, res5, res6, res7;
- uint32x2_t tu0, tu1;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint16x4_t res4;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint16x4_t res5, res6, res7;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
int16x8_t u0, u1;
+#else
+ int16x4_t temp_0;
+#endif
const int16x4_t zero = vdup_n_s16(0);
const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
@@ -746,6 +959,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d_u8 = dst_u8_ptr;
width = w;
__builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
@@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
dst_ptr += (dst_stride << 2);
dst_u8_ptr += (dst8_stride << 2);
height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(d);
+
+ s += 8;
+ do {
+        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ temp_0 = s7;
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ s0 = s4;
+ s4 = temp_0;
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+ __builtin_prefetch(d_u8);
+
+ res4 = vld1_u16(d);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset_vec, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ }
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride);
+ dst_ptr += (dst_stride);
+ dst_u8_ptr += (dst8_stride);
+ height--;
+#endif
} while (height > 0);
} else {
CONV_BUF_TYPE *d_tmp;
uint8_t *d_u8_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t res8, res9, res10, res11;
-
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
@@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d = dst_ptr = dst;
d_u8 = dst_u8_ptr = dst8;
do {
+#if defined(__aarch64__)
+ int16x8_t s11, s12, s13, s14;
+ int16x8_t s8, s9, s10;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res9, res10, res11;
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
dst_ptr += 8 * dst_stride;
dst_u8_ptr += 8 * dst8_stride;
height -= 8;
+#else
+ int16x8_t temp_0;
+ __builtin_prefetch(src_ptr);
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ d_u8 = d_u8_tmp;
+ d_tmp = d;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ if (conv_params->do_average) {
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += (dst_stride);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst_u8_ptr += dst8_stride;
+ height--;
+#endif
} while (height > 0);
}
}
@@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
uint8_t *dst_u8_ptr;
CONV_BUF_TYPE *d, *dst_ptr;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
s = src_ptr;
dst_ptr = dst;
@@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
assert((conv_params->round_1 - 2) >= bits);
if ((w == 4) || (h == 4)) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- uint16x4_t res4, res5, res6, res7;
- uint32x2_t tu0, tu1, tu2, tu3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ uint16x4_t res4;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
int16x8_t u0, u1, u2, u3;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ uint16x4_t res5, res6, res7;
+ uint8x8_t t1;
+#endif
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x4_t shift_vec = vdup_n_s16(-shift_value);
const int16x4_t zero = vdup_n_s16(0);
@@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s += (7 * src_stride);
do {
+#if defined(__aarch64__)
load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
@@ -1154,17 +1492,13 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
round_offset64, round_bits, use_jnt_comp_avg, &t0,
&t1);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 1); // 30 31 32 33
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
d_u8 += dst8_stride;
} else {
store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
@@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s += (src_stride << 2);
height -= 4;
+#else
+ load_unaligned_u8_4x1(s, src_stride, &tu0);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ s7 = vget_low_s16(u0);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+
+ d0 = vadd_s16(d0, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ d += (dst_stride);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ s += (src_stride);
+ height--;
+#endif
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
} while (width > 0);
} else {
CONV_BUF_TYPE *d_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t res8, res9, res10, res11;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x8_t zero = vdupq_n_s16(0);
-
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t res9, res10, res11;
+#endif
dst_ptr = dst;
dst_u8_ptr = dst8;
do {
@@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d_u8 = dst_u8_ptr;
do {
+#if defined(__aarch64__)
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s6 = s14;
s += (8 * src_stride);
height -= 8;
+#else
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp);
+
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += dst_stride;
+ }
+
+ s += (src_stride);
+ height--;
+#endif
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 4bf45a52c..c4ae2e784 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_MEM_NEON_H_
-#define AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
#include <arm_neon.h>
#include <string.h>
@@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
*tu1 = vset_lane_u32(a, *tu1, 1);
}
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+ uint32x2_t *tu0) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+}
+
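
load_unaligned_u8_4x1 completes the existing 4x2/4x4 family for the single-row fallbacks added above; memcpy keeps the 4-byte read alignment-safe and the pixels land in lane 0 of the uint32x2_t (the trailing buf += stride is vestigial here, since only one group is read). Typical use, as in the av1_jnt_convolve_y_neon fallback earlier in this patch:

    uint32x2_t tu0 = vdup_n_u32(0); /* zero-init silences maybe-uninitialized */
    load_unaligned_u8_4x1(s, src_stride, &tu0);
    /* widen the four pixels to int16 for the filter accumulators */
    int16x8_t u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
    int16x4_t s7 = vget_low_s16(u0);
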
static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
uint32x2_t *tu0) {
uint32_t a;
@@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
vst1q_u32(s, s4);
}
-#endif // AV1_COMMON_ARM_MEM_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
index b4808a972..b3a37c4cb 100644
--- a/third_party/aom/av1/common/arm/selfguided_neon.c
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
}
-void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
- int16_t *src, const int src_stride,
- int32_t *dst, const int dst_stride,
- const int width, const int height) {
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+ const int buf_stride, int16_t *src,
+ const int src_stride, int32_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
int16x8_t s0;
int32_t *B_tmp, *dst_ptr;
uint16_t *A_tmp;
@@ -1340,10 +1341,10 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
}
}
-void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
- int stride, int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+ int stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
const sgr_params_type *const params = &sgr_params[sgr_params_idx];
assert(!(params->r[0] == 0 && params->r[1] == 0));
@@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
if (params->r[1] > 0)
restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
bit_depth, sgr_params_idx, 1);
+ return 0;
}
void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index fe134087b..8a3d9f07f 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
#include <arm_neon.h>
@@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+ const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+ const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+ const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *out = d0.val[0];
+ *(out + 1) = d1.val[0];
+ *(out + 2) = d2.val[0];
+ *(out + 3) = d3.val[0];
+ *(out + 4) = d0.val[1];
+ *(out + 5) = d1.val[1];
+ *(out + 6) = d2.val[1];
+ *(out + 7) = d3.val[1];
+}
+
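
vpx_vtrnq_s64_to_s16 (the vpx_ prefix betrays its libvpx ancestry) stands in for the 64-bit transpose step NEON lacks, re-pairing the doubleword halves of two 32-bit-transposed vectors; transpose_s16_8x8q chains the 16-bit, 32-bit, and this 64-bit stage into a full 8x8 transpose. Usage is array-in, array-out (a sketch):

    int16x8_t rows[8]; /* rows[i] = row i of an 8x8 block of int16 samples */
    int16x8_t cols[8];
    /* ... load rows[0..7] ... */
    transpose_s16_8x8q(rows, cols); /* cols[j] now holds former column j */
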
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
int16x4_t *a2, int16x4_t *a3) {
// Swap 16 bit elements. Goes from:
@@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
*a3 = c1.val[1];
}
-#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..7f02d42a7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
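
Concretely, the even/odd split keeps each 16-bit accumulator to at most one large coefficient. For these subpel kernels the two largest taps are adjacent (for example t3 = t4 = 79 in the mid-phase row { -2, -21, 79, 8, 8, 79, -21, -2 } below, reading it back through the stored column order 0, 2, 4, 6, 1, 3, 5, 7), and adjacent taps never share a partial in the order ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). With 8-bit pixels each half is then bounded by roughly (2 + 21 + 79 + 8) * 255 = 28050 < 32767, whereas putting both 79s into one 16-bit partial could reach 158 * 255 = 40290 and wrap; the even and odd halves are only combined after widening (the vaddl_s16 calls in horizontal_filter_neon below).
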
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+ uint8x8_t src_1, int16x4_t *res) {
+ int16x8_t coeff_0, coeff_1;
+ int16x8_t pix_0, pix_1;
+
+ coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+ vreinterpret_s16_s32(x1.val[0]));
+ coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+ vreinterpret_s16_s32(x1.val[1]));
+
+ pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
+ pix_0 = vmulq_s16(coeff_0, pix_0);
+
+ pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
+ pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
+
+ *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
+}
+
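
convolve() folds eight tap products into four partial sums: the coefficients arrive pre-paired by the vtrn_s32 calls at each call site (two taps per coefficient vector), the products are formed and accumulated in int16, and vpadd_s16 adds adjacent lanes. A scalar model (sketch; coeff0/coeff1 and pix0/pix1 stand for the widened coefficient and pixel vectors):

    int16_t coeff0[8], coeff1[8], pix0[8], pix1[8], p[8], r[4];
    /* elementwise multiply-accumulate, as vmulq_s16 + vmlaq_s16 */
    for (int i = 0; i < 8; ++i)
      p[i] = (int16_t)(coeff0[i] * pix0[i] + coeff1[i] * pix1[i]);
    /* fold adjacent lanes, as vpadd_s16 over the low and high halves */
    for (int i = 0; i < 4; ++i)
      r[i] = (int16_t)(p[2 * i] + p[2 * i + 1]);
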
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+ uint8x16_t src_3, uint8x16_t src_4,
+ int16x8_t *tmp_dst, int sx, int alpha,
+ int k, const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+ 255, 0, 255, 0, 255, 0, 255, 0 };
+ const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+ const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int32x2x2_t b0, b1;
+ uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
+ int32x4_t tmp_res_low, tmp_res_high;
+ uint16x8_t res;
+ int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
+
+ uint8x16_t tmp_0 = vandq_u8(src_1, mask);
+ uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+ uint8x16_t tmp_2 = vandq_u8(src_3, mask);
+ uint8x16_t tmp_3 = vandq_u8(src_4, mask);
+
+ tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
+ tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
+
+ src_1 = vaddq_u8(tmp_0, tmp_2);
+ src_2 = vaddq_u8(tmp_1, tmp_3);
+
+ src_1_low = vget_low_u8(src_1);
+ src_2_low = vget_low_u8(src_2);
+ src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+ src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+ src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+ src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+
+ // Loading the 8 filter taps
+ f0 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f1 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f2 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f3 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f4 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f5 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f6 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f7 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+ vreinterpret_s32_s16(vget_low_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+ vreinterpret_s32_s16(vget_low_s16(f6)));
+ convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+ vreinterpret_s32_s16(vget_low_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+ vreinterpret_s32_s16(vget_low_s16(f7)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+ vreinterpret_s32_s16(vget_high_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+ vreinterpret_s32_s16(vget_high_s16(f6)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+ vreinterpret_s32_s16(vget_high_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+ vreinterpret_s32_s16(vget_high_s16(f7)));
+ convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+
+ tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
+ tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+ res = vqrshlq_u16(res, shift);
+
+ tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_neon(const int16x8_t *src,
+ int32x4_t *res_low, int32x4_t *res_high,
+ int sy, int gamma) {
+ int16x4_t src_0, src_1, fltr_0, fltr_1;
+ int32x4_t res_0, res_1;
+ int32x2_t res_0_im, res_1_im;
+ int32x4_t res_even, res_odd, im_res_0, im_res_1;
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int16x8x2_t b0, b1, b2, b3;
+ int32x4x2_t c0, c1, c2, c3;
+ int32x4x2_t d0, d1, d2, d3;
+
+ b0 = vtrnq_s16(src[0], src[1]);
+ b1 = vtrnq_s16(src[2], src[3]);
+ b2 = vtrnq_s16(src[4], src[5]);
+ b3 = vtrnq_s16(src[6], src[7]);
+
+ c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b0.val[1]));
+ c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b2.val[1]));
+ c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ f0 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f1 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f2 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f3 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f4 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f5 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f6 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f7 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
+ d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+ d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
+ d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+ // row:0,1 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 even_col:0,2,4,6
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 even_col:0,2,4,6
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 even_col:0,2,4,6
+ res_even = vaddq_s32(im_res_0, im_res_1);
+
+ // row:0,1 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 odd_col:1,3,5,7
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 odd_col:1,3,5,7
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 odd_col:1,3,5,7
+ res_odd = vaddq_s32(im_res_0, im_res_1);
+
+ // reordering as 0 1 2 3 | 4 5 6 7
+ c0 = vtrnq_s32(res_even, res_odd);
+
+ // Final store
+ *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+ *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
+
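+// For exposition only (a scalar sketch, not used by the codec): what the
+// transposes and multiply-accumulates in vertical_filter_neon compute.
+// Each output column is an 8-tap vertical dot product, with the filter
+// phase advancing by gamma per column; the caller adds the vertical
+// offset (add_const_vert) afterwards.
+static INLINE void vertical_filter_scalar_sketch(const int16_t src[8][8],
+                                                 int32_t res[8], int sy,
+                                                 int gamma) {
+  for (int col = 0; col < 8; ++col) {
+    const int16_t *f = (const int16_t *)(
+        warped_filter + ((sy + col * gamma) >> WARPEDDIFF_PREC_BITS));
+    int32_t sum = 0;
+    for (int row = 0; row < 8; ++row) sum += src[row][col] * f[row];
+    res[col] = sum;
+  }
+}
+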
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int16x8_t tmp[15];
+ const int bd = 8;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+ const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+ const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
+
+ int limit = 0;
+ uint8x16_t vec_dup, mask_val;
+ int32x4_t res_lo, res_hi;
+ int16x8_t result_final;
+ uint8x16_t src_1, src_2, src_3, src_4;
+ uint8x16_t indx_vec = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ uint8x16_t cmp_vec;
+
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16x4_t res_sub_const =
+ vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1))));
+ int k;
+
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ // horizontal
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+
+ if (out_of_boundary_left >= 0) {
+ limit = out_of_boundary_left + 1;
+ cmp_vec = vdupq_n_u8(out_of_boundary_left);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcleq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ if (out_of_boundary_right >= 0) {
+ limit = 15 - (out_of_boundary_right + 1);
+ cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcgeq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ }
+
+ // vertical
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+
+ res_lo = vaddq_s32(res_lo, add_const_vert);
+ res_hi = vaddq_s32(res_hi, add_const_vert);
+
+ if (conv_params->is_compound) {
+ uint16_t *const p =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ uint16x4_t tmp16_lo = vld1_u16(p);
+ int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+ int16x4_t tmp16_low;
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = vmulq_s32(res_lo, bwd);
+ tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+ }
+ int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+ res_low = vqrshl_s16(res_low, round_bits_vec);
+ int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+ uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
+ } else {
+ uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+ vst1_u16(p, res_u16_low);
+ }
+ if (p_width > 4) {
+ uint16_t *const p4 =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+
+ uint16x4_t tmp16_hi = vld1_u16(p4);
+ int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+ int16x4_t tmp16_high;
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = vmulq_s32(res_hi, bwd);
+ tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+ } else {
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+ }
+ int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+ res_high = vqrshl_s16(res_high, round_bits_vec);
+ int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+ uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+
+ vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+ 0);
+ } else {
+ uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+ vst1_u16(p4, res_u16_high);
+ }
+ }
+ } else {
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+
+ result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+ result_final = vsubq_s16(result_final, sub_constant);
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(result_final);
+
+ if (p_width == 4) {
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ } else {
+ vst1_u8(p, val);
+ }
+ }
+ }
+ }
+ }
+}
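For rows that straddle the left or right frame edge, the masked selects in the
horizontal pass above replicate the nearest valid pixel across the
out-of-bounds lanes before filtering. A scalar sketch of the same rule,
assuming row[] holds the 16 bytes loaded from ref + iy * stride + ix4 - 7
(the standalone helper and its name are illustrative, not part of the patch):

#include <stdint.h>

/* Replicate edge pixels over out-of-bounds lanes, mirroring the
 * vcleq_u8/vcgeq_u8 + vbslq_u8 masking in av1_warp_affine_neon. */
static void pad_row_scalar(uint8_t row[16], int out_left, int out_right) {
  if (out_left >= 0) {
    /* Lanes 0 .. out_left lie left of the frame: copy the first valid
     * pixel (lane out_left + 1) over them. */
    for (int i = 0; i <= out_left; ++i) row[i] = row[out_left + 1];
  }
  if (out_right >= 0) {
    /* Lanes 15 - out_right .. 15 lie right of the frame: copy the last
     * valid pixel (lane 14 - out_right) over them. */
    for (int i = 15 - out_right; i < 16; ++i) row[i] = row[14 - out_right];
  }
}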
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
index 72fbed4d4..a9bb5bcf0 100644
--- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -26,7 +26,6 @@
 Apply the horizontal filter and store the result in a temporary buffer.
 When applying the vertical filter, overwrite the original pixel values.
*/
-
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
/* if height is a multiple of 8 */
if (!(h & 7)) {
int16x8_t res0, res1, res2, res3;
- uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+ uint16x8_t res4;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
+ uint16x8_t res5, res6, res7, res8, res9, res10, res11;
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
do {
@@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
dst_ptr += 8 * MAX_SB_SIZE;
height -= 8;
} while (height > 0);
+#else
+ uint8x8_t temp_0;
+
+ do {
+ const uint8_t *s;
+
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ vst1q_u16(d_tmp, res4);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height--;
+ } while (height > 0);
+#endif
} else {
 /* if height is a multiple of 4 */
- int16x8_t tt0, tt1, tt2, tt3;
const uint8_t *s;
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint16x8_t d0;
+ uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t d0, d1, d2, d3;
+ uint16x8_t d1, d2, d3;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
int16x4_t s11, s12, s13, s14;
- uint8x8_t t0, t1, t2, t3;
-
do {
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
@@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
dst_ptr += 4 * MAX_SB_SIZE;
height -= 4;
} while (height > 0);
+#else
+ uint8x8_t temp_0, t4, t5, t6, t7;
+
+ do {
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ __builtin_prefetch(dst_ptr);
+
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+ conv_params->round_0);
+
+ vst1q_u16(d_tmp, d0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height -= 1;
+ } while (height > 0);
+#endif
}
{
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t t0, t1, t2, t3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t t1, t2, t3;
+#endif
int16_t *src_tmp_ptr, *s;
uint8_t *dst_tmp_ptr;
height = h;
@@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
d = dst_tmp_ptr;
height = h;
+#if defined(__aarch64__)
do {
__builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
@@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 8;
} while (w > 0);
+#else
+ do {
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+ s7 = vld1q_s16(s);
+ s += src_stride;
+
+ t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+ bd, conv_params->round_1);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+ } while (height > 0);
+
+ src_tmp_ptr += 8;
+ dst_tmp_ptr += 8;
+
+ w -= 8;
+ } while (w > 0);
+#endif
}
}
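The AArch32 fallback paths added above share one idiom: seven rows stay live
in registers, one new row is loaded per output row, and the window slides by
renaming s0..s6 rather than reloading. A scalar sketch of that sliding window
for a single column (the rounding and round_1 offset handled by
wiener_convolve8_vert_4x8 are omitted; the helper name is illustrative):

#include <stdint.h>

/* 7-tap vertical filter over one column, rotating a 7-entry window the
 * way the s0..s7 registers rotate in the loop above. */
static void wiener_vert_column_sketch(const int16_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride,
                                      int height, const int16_t taps[7]) {
  int32_t s[7];
  for (int t = 0; t < 7; ++t) s[t] = src[t * src_stride];
  for (int y = 0; y < height; ++y) {
    int32_t sum = 0;
    for (int t = 0; t < 7; ++t) sum += s[t] * taps[t];
    if (sum < 0) sum = 0;   /* saturate to 8 bits (vqmovun analogue); */
    if (sum > 255) sum = 255; /* the real code rounds before narrowing */
    dst[y * dst_stride] = (uint8_t)sum;
    if (y + 1 < height) { /* slide the window down one row */
      for (int t = 0; t < 6; ++t) s[t] = s[t + 1];
      s[6] = src[(y + 7) * src_stride];
    }
  }
}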
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
index 8514dc64c..7ef2d6d7f 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -11,56 +11,7 @@
#include <stdlib.h>
#include "av1/common/av1_inv_txfm1d.h"
-
-static void range_check_buf(int32_t stage, const int32_t *input,
- const int32_t *buf, int32_t size, int8_t bit) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- const int64_t max_value = (1LL << (bit - 1)) - 1;
- const int64_t min_value = -(1LL << (bit - 1));
-
- int in_range = 1;
-
- for (int i = 0; i < size; ++i) {
- if (buf[i] < min_value || buf[i] > max_value) {
- in_range = 0;
- }
- }
-
- if (!in_range) {
- fprintf(stderr, "Error: coeffs contain out-of-range values\n");
- fprintf(stderr, "size: %d\n", size);
- fprintf(stderr, "stage: %d\n", stage);
- fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
- max_value);
-
- fprintf(stderr, "coeffs: ");
-
- fprintf(stderr, "[");
- for (int j = 0; j < size; j++) {
- if (j > 0) fprintf(stderr, ", ");
- fprintf(stderr, "%d", input[j]);
- }
- fprintf(stderr, "]\n");
-
- fprintf(stderr, " buf: ");
-
- fprintf(stderr, "[");
- for (int j = 0; j < size; j++) {
- if (j > 0) fprintf(stderr, ", ");
- fprintf(stderr, "%d", buf[j]);
- }
- fprintf(stderr, "]\n\n");
- }
-
- assert(in_range);
-#else
- (void)stage;
- (void)input;
- (void)buf;
- (void)size;
- (void)bit;
-#endif
-}
+#include "av1/common/av1_txfm.h"
// TODO(angiebird): Make 1-d txfm functions static
//
@@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = input[2];
bf1[2] = input[1];
bf1[3] = input[3];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[5];
bf1[6] = input[3];
bf1[7] = input[7];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[7] = bf0[7];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = input[11];
bf1[14] = input[7];
bf1[15] = input[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
bf1[15] = bf0[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = input[23];
bf1[30] = input[15];
bf1[31] = input[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -437,7 +388,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29];
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
output[1] = round_shift(x1, bit);
output[2] = round_shift(x2, bit);
output[3] = round_shift(x3, bit);
- range_check_buf(6, input, output, 4, stage_range[6]);
}
void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[4];
bf1[6] = input[1];
bf1[7] = input[6];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = input[12];
bf1[14] = input[1];
bf1[15] = input[14];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13];
bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = input[47];
bf1[62] = input[31];
bf1[63] = input[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 10
stage++;
@@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 11
stage++;
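Throughout these hunks each stage alternates clamp_value() saturation with
half_btf() butterflies; the butterfly reduces to a weighted two-input sum
rounded down by cos_bit. A sketch, assuming bit > 0 (the upstream half_btf in
av1_txfm.h may additionally carry coefficient range checking):

#include <stdint.h>

/* Rounding half-butterfly: (w0*in0 + w1*in1 + round) >> bit. */
static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}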
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
index 64a1a921c..c31c019aa 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_INV_TXFM1D_H_
-#define AV1_INV_TXFM1D_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
#include "av1/common/av1_txfm.h"
@@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
}
#endif
-#endif // AV1_INV_TXFM1D_H_
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
index 4c600f756..7d80a0099 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_INV_TXFM2D_CFG_H_
-#define AV1_INV_TXFM2D_CFG_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
#include "av1/common/av1_inv_txfm1d.h"
// sum of fwd_shift_##
@@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
-#endif // AV1_INV_TXFM2D_CFG_H_
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 9d68b8760..537d8dfe9 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -68,23 +68,6 @@ static const int mode_lf_lut[] = {
// 10101010|10101010
//
// A loopfilter should be applied to every other 4x4 horizontally.
-// TODO(chengchen): make these tables static
-const FilterMask left_txform_mask[TX_SIZES] = {
- { { 0xffffffffffffffffULL, // TX_4X4,
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
- { { 0x5555555555555555ULL, // TX_8X8,
- 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
-
- { { 0x1111111111111111ULL, // TX_16X16,
- 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
-
- { { 0x0101010101010101ULL, // TX_32X32,
- 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
-
- { { 0x0001000100010001ULL, // TX_64X64,
- 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
// 256-bit masks (64x64 / 4x4) for the above transform size for the Y plane.
// We use 4 uint64_t values to represent the 256 bits.
@@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = {
// 00000000|00000000
//
// A loopfilter should be applied to every other 4x4 horizontally.
-const FilterMask above_txform_mask[TX_SIZES] = {
- { { 0xffffffffffffffffULL, // TX_4X4
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
- { { 0x0000ffff0000ffffULL, // TX_8X8
- 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
-
- { { 0x000000000000ffffULL, // TX_16X16
- 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
-
- { { 0x000000000000ffffULL, // TX_32X32
- 0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
-
- { { 0x000000000000ffffULL, // TX_64X64
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
};
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 4x4 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
- { { 0x0000000000000001ULL, // BLOCK_4X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000010001ULL, // BLOCK_4X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000000003ULL, // BLOCK_8X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000030003ULL, // BLOCK_8X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0003000300030003ULL, // BLOCK_8X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00000000000f000fULL, // BLOCK_16X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000f000f000f000fULL, // BLOCK_16X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000f000f000f000fULL, // BLOCK_16X32
- 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X32
- 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X64
- 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
-
- { { 0xffffffffffffffffULL, // BLOCK_64X32
- 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0xffffffffffffffffULL, // BLOCK_64X64
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
- // Y plane max coding block size is 128x128, but the codec divides it
- // into 4 64x64 blocks.
- // BLOCK_64X128
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
- // BLOCK_128X64
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
- // BLOCK_128X128
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-
- { { 0x0001000100010001ULL, // BLOCK_4X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000000000000000fULL, // BLOCK_16X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0003000300030003ULL, // BLOCK_8X32
- 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
- { { 0x0000000000ff00ffULL, // BLOCK_32X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
- { { 0x000f000f000f000fULL, // BLOCK_16X64
- 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, 0, 1, 2,
+ 3, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
+const FilterMask left_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+ 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+ 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+ 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+ 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+ 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
- { { 0xffffffffffffffffULL, // BLOCK_64X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
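+
+// Illustrative helper (a sketch for exposition, not used by the codec):
+// how the mask_id tables above select a row of the reordered mask arrays.
+// For TX_4X4 the ids index the first 19 rows directly; an id of -1 means
+// the block size has no entry at that transform size (the 128-sized
+// block sizes for TX_4X4).
+static INLINE const FilterMask *left_mask_for_tx_4x4_sketch(
+    BLOCK_SIZE bsize) {
+  const int id = mask_id_table_tx_4x4[bsize];
+  return (id >= 0) ? &left_mask_univariant_reordered[id] : NULL;
+}
+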
+const FilterMask above_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+ 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+ 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+ 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+ 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+ 0x000000000000000fULL } }, // block size 16X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
};
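For orientation, each FilterMask in the table above packs the 4x4 units of a 64x64 superblock into four uint64_t words, four 16-column rows per word. A hedged decode helper (the helper name is illustrative, not from the patch):

#include <stdint.h>

// Sketch: test the 4x4 unit at (row, col) of a 64x64 superblock, using the
// 4-rows-of-16-columns-per-word layout documented at get_index_shift().
static int mask_bit_set(const uint64_t bits[4], int row, int col) {
  return (int)((bits[row >> 2] >> (((row & 3) << 4) + col)) & 1);
}

Spot check: the "8X32, TX_4X4" entry (0x0003... in words 0 and 1) sets exactly columns 0-1 of rows 0-7, i.e. a block 8 pixels wide and 32 pixels tall tiled with 4x4 transforms.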
LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
int mi_col) {
- if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
- (mi_col << MI_SIZE_LOG2) >= cm->width)
- return NULL;
assert(cm->lf.lfm != NULL);
const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64
const int col = mi_col >> MIN_MIB_SIZE_LOG2;
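The rest of the lookup is cut off by the hunk; it plausibly reduces to row-major superblock indexing. A minimal sketch, assuming lfm_stride counts one LoopFilterMask per 64x64 superblock per row (field names mirror the patch, the arithmetic is an assumption):

// Hypothetical flattened index into cm->lf.lfm for superblock (row, col).
static int lfm_index(int mi_row, int mi_col, int lfm_stride) {
  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64 units
  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
  return row * lfm_stride + col;
}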
@@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
SIMD_WIDTH);
}
}
-static uint8_t get_filter_level(const AV1_COMMON *cm,
- const loop_filter_info_n *lfi_n,
- const int dir_idx, int plane,
- const MB_MODE_INFO *mbmi) {
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+ const int dir_idx, int plane,
+ const MB_MODE_INFO *mbmi) {
const int segment_id = mbmi->segment_id;
if (cm->delta_lf_present_flag) {
int delta_lf;
@@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
}
}
}
-
-#if LOOP_FILTER_BITMASK
- memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.y_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.y_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.u_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.u_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.v_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.v_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64);
-#endif // LOOP_FILTER_BITMASK
}
#if LOOP_FILTER_BITMASK
@@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
 // After locating which uint64_t, mi_row % 4 gives the row offset within
 // it, and each row holds 16 (= 1 << stride_log2) 4x4 units.
 // Therefore, shift = (row << stride_log2) + mi_col;
-static int get_index_shift(int mi_col, int mi_row, int *index) {
+int get_index_shift(int mi_col, int mi_row, int *index) {
// *index = mi_row >> 2;
// rows = mi_row % 4;
// stride_log2 = 4;
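A concrete evaluation of the formula in the comment, as a hedged self-check (index_shift_example is illustrative only and relies on the function above):

#include <assert.h>

static void index_shift_example(void) {
  int index;
  const int shift = get_index_shift(/*mi_col=*/9, /*mi_row=*/5, &index);
  assert(index == 1);   // 5 >> 2
  assert(shift == 25);  // ((5 % 4) << 4) + 9
}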
@@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
else
lfm->lfl_y_hor[row][col] = level;
} else if (plane == 1) {
- if (dir == VERT_EDGE)
- lfm->lfl_u_ver[row][col] = level;
- else
- lfm->lfl_u_hor[row][col] = level;
+ lfm->lfl_u[row][col] = level;
} else {
- if (dir == VERT_EDGE)
- lfm->lfl_v_ver[row][col] = level;
- else
- lfm->lfl_v_hor[row][col] = level;
+ lfm->lfl_v[row][col] = level;
}
}
}
@@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
const TX_SIZE prev_tx_size =
plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
: mbmi_prev->tx_size;
- const TX_SIZE min_tx_size =
- (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
- txsize_horz_map[prev_tx_size])
- : AOMMIN(txsize_vert_map[tx_size],
- txsize_vert_map[prev_tx_size]);
+ TX_SIZE min_tx_size = (dir == VERT_EDGE)
+ ? AOMMIN(txsize_horz_map[tx_size],
+ txsize_horz_map[prev_tx_size])
+ : AOMMIN(txsize_vert_map[tx_size],
+ txsize_vert_map[prev_tx_size]);
+ min_tx_size = AOMMIN(min_tx_size, TX_16X16);
assert(min_tx_size < TX_SIZES);
const int row = r % MI_SIZE_64X64;
const int col = c % MI_SIZE_64X64;
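The new AOMMIN(min_tx_size, TX_16X16) clamp matches the filtering stage, which only consults the TX_4X4, TX_8X8 and TX_16X16 mask buckets. A sketch of the edge-size rule under the simplifying assumption of square transforms (the real code first maps rectangular sizes through txsize_horz_map/txsize_vert_map):

// Deblock length at a boundary: the smaller of the two adjacent
// transforms, capped at 16 samples. TX_8X8 vs TX_32X32 filters as
// TX_8X8; TX_64X64 vs TX_32X32 filters as TX_16X16.
static TX_SIZE edge_tx_size(TX_SIZE a, TX_SIZE b) {
  return AOMMIN(AOMMIN(a, b), TX_16X16);
}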
@@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
} else if (plane == 1) {
av1_zero(lfm->left_u);
av1_zero(lfm->above_u);
- av1_zero(lfm->lfl_u_ver);
- av1_zero(lfm->lfl_u_hor);
+ av1_zero(lfm->lfl_u);
} else {
av1_zero(lfm->left_v);
av1_zero(lfm->above_v);
- av1_zero(lfm->lfl_v_ver);
- av1_zero(lfm->lfl_v_hor);
+ av1_zero(lfm->lfl_v);
}
}
}
@@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2(
if ((mask_16x16_0 & mask_16x16_1) & 1) {
if (plane) {
- // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
- aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
} else {
- // TODO(any): add dual function simd function. Current sse2 code
- // just called aom_lpf_vertical_14_sse2 twice.
aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
lfi1->hev_thr);
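The replacement of paired single calls with the *_dual variants is mechanical; as a reference, a scalar fallback equivalent to the fused chroma call would look like this (a sketch, assuming the aom_lpf_vertical_6 prototype from the RTCD tables):

static void lpf_vertical_6_dual_ref(uint8_t *s, int pitch,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  // Two adjacent 4-row edges, exactly what the removed pair of calls did.
  aom_lpf_vertical_6(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_6(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
}

Fusing the two lets SIMD implementations share work across both edges, which is presumably the motivation for the change.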
@@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2(
if ((mask_8x8_0 & mask_8x8_1) & 1) {
if (plane) {
- aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
} else {
aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
@@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2(
if ((mask_16x16_0 & mask_16x16_1) & 1) {
if (plane) {
- aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
} else {
aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim,
@@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2(
if ((mask_8x8_0 & mask_8x8_1) & 1) {
if (plane) {
- aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
} else {
aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim,
@@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
if ((mask_16x16 & two_block_mask) == two_block_mask) {
- /*
- aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr);
- */
-
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
count = 2;
} else {
lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
if ((mask_8x8 & two_block_mask) == two_block_mask) {
- /*
- aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- */
-
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
count = 2;
} else {
lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & two_block_mask) == two_block_mask) {
- /*
aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, lfin->mblim, lfin->lim,
lfin->hev_thr);
- */
- aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr);
count = 2;
} else {
aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz(
plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
if ((mask_16x16 & two_block_mask) == two_block_mask) {
- /*
- aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, bd);
- */
-
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
count = 2;
} else {
highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz(
plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
if ((mask_8x8 & two_block_mask) == two_block_mask) {
- /*
- aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
- */
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
count = 2;
} else {
highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz(
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & two_block_mask) == two_block_mask) {
- /*
aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, lfin->mblim, lfin->lim,
lfin->hev_thr, bd);
- */
- aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, bd);
- aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
count = 2;
} else {
aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
@@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz(
}
}
-static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
- uint8_t *dst_buf, int ref_stride, int dst_stride,
- int start, int end) {
- return 0;
-
- start <<= MI_SIZE_LOG2;
- end <<= MI_SIZE_LOG2;
- uint8_t *ref0 = ref_buf;
- uint8_t *dst0 = dst_buf;
- if (cm->seq_params.use_highbitdepth) {
- const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
- const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
- for (int j = 0; j < 4; ++j) {
- for (int i = start; i < end; ++i)
- if (ref16[i] != dst16[i]) {
- ref_buf = ref0;
- dst_buf = dst0;
- return i + 1;
+void av1_build_bitmask_vert_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+ const int mi_row = r << subsampling_y;
+ const int row = mi_row % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(0, row, &index);
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+ c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+ const int mi_col = c << subsampling_x;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int col_in_unit = 0;
+ col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+ const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+ if (x >= plane_ptr->dst.width) break;
+ const int col = col_in_unit << subsampling_x;
+ const uint64_t mask = ((uint64_t)1 << (shift | col));
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_ver[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
}
- ref16 += ref_stride;
- dst16 += dst_stride;
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((c + col_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+ switch (plane) {
+ case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ col_in_unit += tx_size_wide_unit[tx_size];
+ }
}
- } else {
- for (int j = 0; j < 4; ++j) {
- for (int i = start; i < end; ++i)
- if (ref_buf[i] != dst_buf[i]) {
- ref_buf = ref0;
- dst_buf = dst0;
- return i + 1;
+ }
+}
+
+void av1_build_bitmask_horz_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+ const int mi_col = c << subsampling_x;
+ const int col = mi_col % MI_SIZE_64X64;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+ r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+ const int mi_row = r << subsampling_y;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int r_in_unit = 0;
+ r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+ const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+ if (y >= plane_ptr->dst.height) break;
+ const int row = r_in_unit << subsampling_y;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ const uint64_t mask = ((uint64_t)1 << shift);
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_hor[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
}
- ref_buf += ref_stride;
- dst_buf += dst_stride;
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((r + r_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+ switch (plane) {
+ case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ r_in_unit += tx_size_high_unit[tx_size];
+ }
+ }
+ }
+}
+
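Both builders above gate edge filtering with the same predicate; isolated here as a sketch (not a refactor present in the patch):

// Filter an edge when at least one side has a nonzero filter level and
// the edge is not interior to a skipped (no-residual) region, unless it
// is also a coding block boundary.
static int should_filter_edge(int level, int prev_level, int skip,
                              int prev_skip, int is_cb_border) {
  return (level || prev_level) && (!prev_skip || !skip || is_cb_border);
}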
+void av1_filter_block_plane_bitmask_vert(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int two_row_step = 2 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ const int two_row_stride = row_stride << 1;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+
+ // 1. vertical filtering. filter two rows at a time
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += two_row_step) {
+ const int row = r | ssy;
+ const int row_next = row + row_step;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+ dst->buf += two_row_stride;
+ }
+ // reset buf pointer for horizontal filtering
+ dst->buf = buf0;
+}
+
+void av1_filter_block_plane_bitmask_horz(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += row_step) {
+ if (mi_row + r == 0) {
+ dst->buf += row_stride;
+ continue;
}
+ const int row = r | ssy;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+ dst->buf += row_stride;
}
- ref_buf = ref0;
- dst_buf = dst0;
- return 0;
+ // reset buf pointer for next block
+ dst->buf = buf0;
}
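In both bitmask filter functions, mask_cutoff (0xffff) isolates one 16-column row of 4x4 units after the shift; a hedged helper form:

// Sketch: one superblock row of column bits from a 256-bit mask word.
static uint16_t extract_row_bits(uint64_t bits, int shift) {
  return (uint16_t)((bits >> shift) & 0xffff);
}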
void av1_filter_block_plane_ver(AV1_COMMON *const cm,
@@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
mask_16x16 = lfm->left_u[TX_16X16].bits[index];
mask_8x8 = lfm->left_u[TX_8X8].bits[index];
mask_4x4 = lfm->left_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u_ver[row][col];
- lfl2 = &lfm->lfl_u_ver[row_next][col];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
break;
case 2:
mask_16x16 = lfm->left_v[TX_16X16].bits[index];
mask_8x8 = lfm->left_v[TX_8X8].bits[index];
mask_4x4 = lfm->left_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v_ver[row][col];
- lfl2 = &lfm->lfl_v_ver[row_next][col];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
break;
default: assert(pl >= 0 && pl <= 2); return;
}
@@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
mask_16x16 = lfm->above_u[TX_16X16].bits[index];
mask_8x8 = lfm->above_u[TX_8X8].bits[index];
mask_4x4 = lfm->above_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u_hor[row][col];
+ lfl = &lfm->lfl_u[row][col];
break;
case 2:
mask_16x16 = lfm->above_v[TX_16X16].bits[index];
mask_8x8 = lfm->above_v[TX_8X8].bits[index];
mask_4x4 = lfm->above_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v_hor[row][col];
+ lfl = &lfm->lfl_v[row][col];
break;
default: assert(pl >= 0 && pl <= 2); return;
}
@@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
int plane_start, int plane_end) {
struct macroblockd_plane *pd = xd->plane;
const int col_start = 0;
@@ -1827,6 +2231,45 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
int mi_row, mi_col;
int plane;
+#if LOOP_FILTER_BITMASK
+ if (is_decoding) {
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
+ plane, plane + 1);
+ av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+ av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+
+ // apply loop filtering which only goes through buffer once
+ for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+ plane, plane + 1);
+ av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
+ mi_col);
+ if (mi_col - MI_SIZE_64X64 >= 0) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
+ return;
+ }
+#endif
+
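The decoding path above staggers the two passes; a pseudocode restatement of the apparent intent (a sketch):

// for each 64x64 superblock (r, c) in raster order:
//   filter vertical edges of (r, c)
//   if c > 0: filter horizontal edges of (r, c - 1)
// after each row: filter horizontal edges of the last column
//
// Horizontal filtering trails by one superblock so that the pixels it
// reads have already been vertically filtered on both sides.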
for (plane = plane_start; plane < plane_end; plane++) {
if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
break;
@@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
}
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
- MACROBLOCKD *xd, int plane_start, int plane_end,
- int partial_frame) {
+ MACROBLOCKD *xd,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
+ int plane_start, int plane_end, int partial_frame) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
@@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
end_mi_row = start_mi_row + mi_rows_to_filter;
av1_loop_filter_frame_init(cm, plane_start, plane_end);
- loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
- plane_end);
+ loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
+#if LOOP_FILTER_BITMASK
+ is_decoding,
+#endif
+ plane_start, plane_end);
}
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
index c35c3b2dc..80ac61178 100644
--- a/third_party/aom/av1/common/av1_loopfilter.h
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_LOOPFILTER_H_
-#define AV1_COMMON_LOOPFILTER_H_
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
#include "config/aom_config.h"
@@ -60,51 +60,20 @@ typedef struct {
uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
- // U plane vertical edge and horizontal edge filter level
- uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
- uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+ // U plane filter level
+ uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
- // V plane vertical edge and horizontal edge filter level
- uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
- uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-} LoopFilterMask;
+ // V plane filter level
+ uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
-// To determine whether to apply loop filtering at one transform block edge,
-// we need information of the neighboring transform block. Specifically,
-// in determining a vertical edge, we need the information of the tx block
-// to its left. For a horizontal edge, we need info of the tx block above it.
-// Thus, we need to record info of right column and bottom row of tx blocks.
-// We record the information of the neighboring superblock, when bitmask
-// building for a superblock is finished. And it will be used for next
-// superblock bitmask building.
-// Information includes:
-// ------------------------------------------------------------
-// MI_SIZE_64X64
-// Y tx_size above |--------------|
-// Y tx_size left |--------------|
-// UV tx_size above |--------------|
-// UV tx_size left |--------------|
-// Y level above |--------------|
-// Y level left |--------------|
-// U level above |--------------|
-// U level left |--------------|
-// V level above |--------------|
-// V level left |--------------|
-// skip |--------------|
-// ------------------------------------------------------------
-typedef struct {
- TX_SIZE tx_size_y_above[MI_SIZE_64X64];
- TX_SIZE tx_size_y_left[MI_SIZE_64X64];
- TX_SIZE tx_size_uv_above[MI_SIZE_64X64];
- TX_SIZE tx_size_uv_left[MI_SIZE_64X64];
- uint8_t y_level_above[MI_SIZE_64X64];
- uint8_t y_level_left[MI_SIZE_64X64];
- uint8_t u_level_above[MI_SIZE_64X64];
- uint8_t u_level_left[MI_SIZE_64X64];
- uint8_t v_level_above[MI_SIZE_64X64];
- uint8_t v_level_left[MI_SIZE_64X64];
- uint8_t skip[MI_SIZE_64X64];
-} LpfSuperblockInfo;
+ // other info
+ FilterMask skip;
+ FilterMask is_vert_border;
+ FilterMask is_horz_border;
+ // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+ FilterMask tx_size_ver[2][5];
+ FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
#endif // LOOP_FILTER_BITMASK
struct loopfilter {
@@ -130,7 +99,6 @@ struct loopfilter {
LoopFilterMask *lfm;
size_t lfm_num;
int lfm_stride;
- LpfSuperblockInfo neighbor_sb_lpf_info;
#endif // LOOP_FILTER_BITMASK
};
@@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm);
void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
int plane_end);
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int is_decoding,
+ int plane_start, int plane_end, int partial_frame);
+#else
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
struct macroblockd *mbd, int plane_start,
int plane_end, int partial_frame);
+#endif
void av1_filter_block_plane_vert(const struct AV1Common *const cm,
const MACROBLOCKD *const xd, const int plane,
@@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData {
MACROBLOCKD *xd;
} LFWorkerData;
+uint8_t get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
#if LOOP_FILTER_BITMASK
void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
int plane, int subsampling_x, int subsampling_y,
@@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm,
void av1_filter_block_plane_hor(struct AV1Common *const cm,
struct macroblockd_plane *const plane, int pl,
int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+ int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {
+ { { 0x0000000000000001ULL, // TX_4X4,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0000000000010001ULL, // TX_8X8,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_16X16,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_32X32,
+ 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_64X64,
+ 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000003ULL, // TX_8X8
+ 0x000000000000000fULL, // TX_16X16
+ 0x00000000000000ffULL, // TX_32X32
+ 0x000000000000ffffULL, // TX_64X64
+ },
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000005ULL, // TX_8X8
+ 0x0000000000000055ULL, // TX_16X16
+ 0x0000000000005555ULL, // TX_32X32
+ 0x0000000055555555ULL, // TX_64X64
+ },
+};
+
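The two rows of above_txform_mask carry the same masks at different bit spacings; the second row equals the first with each bit spread to an even position, consistent with a subsampled (chroma) 4x4 grid, though that interpretation is an assumption. A quick verification sketch:

#include <stdint.h>

// Spread the low bits of a contiguous mask onto even bit positions.
static uint64_t spread_to_even_bits(uint64_t m) {
  uint64_t out = 0;
  for (int i = 0; i < 32; ++i)
    if (m & (1ULL << i)) out |= 1ULL << (2 * i);
  return out;
}
// spread_to_even_bits(0x3) == 0x5, 0xf -> 0x55, 0xff -> 0x5555,
// 0xffff -> 0x55555555: exactly the second row of the table.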
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
#endif
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_LOOPFILTER_H_
+#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index fa8b34981..dee1f1c79 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
+
# directional intra predictor functions
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
-
# FILTER_INTRA predictor functions
add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
specialize qw/av1_filter_intra_predictor sse4_1/;
@@ -108,6 +108,22 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou
add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/;
-
add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride
# build compound seg mask functions
add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
#
# Encoder functions below this point.
@@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/av1_temporal_filter_apply sse2 msa/;
- add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
# ENCODEMB INVOKE
@@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
specialize qw/av1_get_nz_map_contexts sse2/;
add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
- specialize qw/av1_txb_init_levels sse4_1/;
+ specialize qw/av1_txb_init_levels sse4_1 avx2/;
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
specialize qw/av1_get_crc32c_value sse4_2/;
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ specialize qw/av1_compute_stats sse4_1 avx2/;
+
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
}
# end encoder functions
@@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
# WARPED_MOTION / GLOBAL_MOTION functions
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1/;
+specialize qw/av1_warp_affine sse4_1 neon/;
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_highbd_warp_affine sse4_1/;
@@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
-add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
- int sgr_params_idx, int bit_depth, int highbd";
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd";
specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
index 1e6654121..bb70eab70 100644
--- a/third_party/aom/av1/common/av1_txfm.c
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -108,3 +108,53 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
1, // TXFM_TYPE_IDENTITY16
1, // TXFM_TYPE_IDENTITY32
};
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ const int64_t max_value = (1LL << (bit - 1)) - 1;
+ const int64_t min_value = -(1LL << (bit - 1));
+
+ int in_range = 1;
+
+ for (int i = 0; i < size; ++i) {
+ if (buf[i] < min_value || buf[i] > max_value) {
+ in_range = 0;
+ }
+ }
+
+ if (!in_range) {
+ fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+ fprintf(stderr, "size: %d\n", size);
+ fprintf(stderr, "stage: %d\n", stage);
+ fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+ max_value);
+
+ fprintf(stderr, "coeffs: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", input[j]);
+ }
+ fprintf(stderr, "]\n");
+
+ fprintf(stderr, " buf: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", buf[j]);
+ }
+ fprintf(stderr, "]\n\n");
+ }
+
+ assert(in_range);
+#else
+ (void)stage;
+ (void)input;
+ (void)buf;
+ (void)size;
+ (void)bit;
+#endif
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index c9cc79852..59d64ca4a 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXFM_H_
-#define AV1_TXFM_H_
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
#include <assert.h>
#include <math.h>
@@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5];
static const int cos_bit_min = 10;
static const int cos_bit_max = 16;
-static const int NewSqrt2Bits = 12;
+#define NewSqrt2Bits ((int32_t)12)
// 2^12 * sqrt(2)
static const int32_t NewSqrt2 = 5793;
// 2^12 / sqrt(2)
@@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
#if DO_RANGE_CHECK_CLAMP
bit = AOMMIN(bit, 31);
- return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
+ return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
#endif // DO_RANGE_CHECK_CLAMP
(void)bit;
return value;
@@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) {
static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
int bit) {
int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+ int64_t intermediate = result_64 + (1LL << (bit - 1));
+ // NOTE(david.barker): The value 'result_64' may not necessarily fit
+ // into 32 bits. However, the result of this function is nominally
+ // ROUND_POWER_OF_TWO_64(result_64, bit)
+ // and that is required to fit into stage_range[stage] many bits
+ // (checked by range_check_buf()).
+ //
+ // Here we've unpacked that rounding operation, and it can be shown
+ // that the value of 'intermediate' here *does* fit into 32 bits
+ // for any conformant bitstream.
+ // The upshot is that, if you do all this calculation using
+ // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+ // then you'll still get the correct result.
+ // To provide a check on this logic, we assert that 'intermediate'
+ // would fit into an int32 if range checking is enabled.
#if CONFIG_COEFFICIENT_RANGE_CHECKING
- assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
+ assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
#endif
- return round_shift(result_64, bit);
+ return (int32_t)(intermediate >> bit);
}
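A compact restatement of the claim in the note above, computed with wrapping 32-bit arithmetic (a sketch; it assumes two's-complement conversion and arithmetic right shift, which libaom's targets provide in practice):

#include <stdint.h>

static int32_t half_btf_wrap32(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  // Unsigned arithmetic wraps instead of overflowing; for conformant
  // inputs the 'intermediate' above fits in 32 bits, so its value mod
  // 2^32 recovers it exactly and the shifted result is identical.
  const uint32_t sum = (uint32_t)w0 * (uint32_t)in0 +
                       (uint32_t)w1 * (uint32_t)in1 + (1u << (bit - 1));
  return (int32_t)sum >> bit;
}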
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
@@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) {
static INLINE int get_txh_idx(TX_SIZE tx_size) {
return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
}
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit);
#define MAX_TXWH_IDX 5
#ifdef __cplusplus
}
#endif // __cplusplus
-#endif // AV1_TXFM_H_
+#endif // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
index 86b4b5d6c..2e796b656 100644
--- a/third_party/aom/av1/common/blockd.c
+++ b/third_party/aom/av1/common/blockd.c
@@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
return above_mi->mode;
}
-void av1_foreach_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
- foreach_transformed_block_visitor visit, void *arg) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
- // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
- // transform size varies per plane, look it up in a common way.
- const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const uint8_t txw_unit = tx_size_wide_unit[tx_size];
- const uint8_t txh_unit = tx_size_high_unit[tx_size];
- const int step = txw_unit * txh_unit;
- int i = 0, r, c;
-
- // If mb_to_right_edge is < 0 we are in a situation in which
- // the current block size extends into the UMV and we won't
- // visit the sub blocks that are wholly within the UMV.
- const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
- const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
- int blk_row, blk_col;
-
- const BLOCK_SIZE max_unit_bsize =
- get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
- int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
- int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
- mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
- mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
-
- // Keep track of the row and column of the blocks we use so that we know
- // if we are in the unrestricted motion border.
- for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
- const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
- // Skip visiting the sub blocks that are wholly within the UMV.
- for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
- const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
- for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
- for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
- visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
- i += step;
- }
- }
- }
- }
-}
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- foreach_transformed_block_visitor visit,
- void *arg, const int num_planes) {
- for (int plane = 0; plane < num_planes; ++plane) {
- if (!is_chroma_reference(mi_row, mi_col, bsize,
- xd->plane[plane].subsampling_x,
- xd->plane[plane].subsampling_y))
- continue;
- av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
- }
-}
-
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
@@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
xd->plane[i].subsampling_x = i ? ss_x : 0;
xd->plane[i].subsampling_y = i ? ss_y : 0;
}
+ for (i = num_planes; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].subsampling_x = 1;
+ xd->plane[i].subsampling_y = 1;
+ }
}
const int16_t dr_intra_derivative[90] = {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 979f13bd9..a2311c1b0 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_BLOCKD_H_
-#define AV1_COMMON_BLOCKD_H_
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
#include "config/aom_config.h"
@@ -38,13 +38,13 @@ extern "C" {
#define MAX_DIFFWTD_MASK_BITS 1
// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
DIFFWTD_38 = 0,
DIFFWTD_38_INV,
DIFFWTD_MASK_TYPES,
} DIFFWTD_MASK_TYPE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
KEY_FRAME = 0,
INTER_FRAME = 1,
INTRA_ONLY_FRAME = 2, // replaces intra-only
@@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
}
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
- return mode >= NEARESTMV && mode <= NEW_NEWMV;
+ return mode >= INTER_MODE_START && mode < INTER_MODE_END;
}
typedef struct {
@@ -66,10 +66,10 @@ typedef struct {
} BUFFER_SET;
static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
- return mode >= NEARESTMV && mode <= NEWMV;
+ return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
}
static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
- return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+ return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
}
static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
@@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
}
-static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
- return (type == COMPOUND_WEDGE);
-}
-
static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
}
@@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO {
int mi_row;
int mi_col;
#endif
- int num_proj_ref[2];
- WarpedMotionParams wm_params[2];
+ int num_proj_ref;
+ WarpedMotionParams wm_params;
// Index of the alpha Cb and alpha Cr combination
int cfl_alpha_idx;
@@ -376,7 +372,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
}
#endif
-enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
struct buf_2d {
uint8_t *buf;
@@ -500,6 +496,8 @@ typedef struct jnt_comp_params {
int bck_offset;
} JNT_COMP_PARAMS;
+// Most/all of the pointers are mere pointers; the actual arrays are allocated
+// elsewhere. This is mostly for coding convenience.
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
@@ -544,7 +542,7 @@ typedef struct macroblockd {
SgrprojInfo sgrproj_info[MAX_MB_PLANE];
// block dimension in the unit of mode_info.
- uint8_t n8_w, n8_h;
+ uint8_t n4_w, n4_h;
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
@@ -599,6 +597,9 @@ typedef struct macroblockd {
uint16_t cb_offset[MAX_MB_PLANE];
uint16_t txb_offset[MAX_MB_PLANE];
uint16_t color_index_map_offset[2];
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
} MACROBLOCKD;
static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
@@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
}
}
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
// Note: the input block size should be square.
// Otherwise it's considered invalid.
static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
@@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
return intra_mode_to_tx_type(mbmi, plane_type);
}
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
int subsampling_x,
int subsampling_y) {
@@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg);
-void av1_foreach_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
- foreach_transformed_block_visitor visit, void *arg);
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- foreach_transformed_block_visitor visit,
- void *arg, const int num_planes);
-
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff);
@@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
}
static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
- return (mode >= NEARESTMV) && (mode <= NEWMV);
+ return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
}
static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
@@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
is_motion_variation_allowed_compound(mbmi)) {
if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
assert(!has_second_ref(mbmi));
- if (mbmi->num_proj_ref[0] >= 1 &&
+ if (mbmi->num_proj_ref >= 1 &&
(allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
if (xd->cur_frame_force_integer_mv) {
return OBMC_CAUSAL;
@@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
} // extern "C"
#endif
-#endif // AV1_COMMON_BLOCKD_H_
+#endif // AOM_AV1_COMMON_BLOCKD_H_
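
The rewritten mode predicates above depend on PREDICTION_MODE staying a contiguous enum bracketed by sentinel values; a minimal sketch of the same half-open-range pattern (the enum below is a toy layout, only its ordering mirrors the enums.h change):

#include <assert.h>

/* Toy mode layout; only the ordering mirrors enums.h. */
enum demo_mode {
  DC_PRED_DEMO,           /* intra modes ... */
  NEARESTMV_DEMO,         /* = SINGLE_INTER_MODE_START */
  NEARMV_DEMO,
  GLOBALMV_DEMO,
  NEWMV_DEMO,
  NEAREST_NEARESTMV_DEMO, /* = SINGLE_INTER_MODE_END = COMP_INTER_MODE_START */
  NEW_NEWMV_DEMO,
  MB_MODE_COUNT_DEMO      /* = COMP_INTER_MODE_END = INTER_MODE_END */
};

static int is_inter_mode_demo(enum demo_mode mode) {
  /* Half-open range [START, END): stays correct if new inter modes are
     appended, unlike the old closed-range check against NEW_NEWMV. */
  return mode >= NEARESTMV_DEMO && mode < MB_MODE_COUNT_DEMO;
}

int main(void) {
  assert(!is_inter_mode_demo(DC_PRED_DEMO));
  assert(is_inter_mode_demo(NEW_NEWMV_DEMO));
  return 0;
}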
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
index 092230de9..3b2eac8a5 100644
--- a/third_party/aom/av1/common/cdef.h
+++ b/third_party/aom/av1/common/cdef.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_CDEF_H_
-#define AV1_COMMON_CDEF_H_
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
#define CDEF_STRENGTH_BITS 6
@@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_CDEF_H_
+#endif // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
index 81c6da077..6b4452cd6 100644
--- a/third_party/aom/av1/common/cdef_block.h
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#if !defined(_CDEF_BLOCK_H)
-#define _CDEF_BLOCK_H (1)
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
#include "av1/common/odintrin.h"
@@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int pri_damping, int sec_damping,
int coeff_shift);
-#endif
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
index d24a7c0fa..14587a023 100644
--- a/third_party/aom/av1/common/cdef_block_simd.h
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
#include "config/av1_rtcd.h"
#include "av1/common/cdef_block.h"
@@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
}
}
}
+
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
index bc9fbce1b..d627891bf 100644
--- a/third_party/aom/av1/common/cfl.h
+++ b/third_party/aom/av1/common/cfl.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_CFL_H_
-#define AV1_COMMON_CFL_H_
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
@@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
return pred[tx_size % TX_SIZES_ALL]; \
}
-#endif // AV1_COMMON_CFL_H_
+#endif // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
index 72c6d3a1e..bed6083db 100644
--- a/third_party/aom/av1/common/common.h
+++ b/third_party/aom/av1/common/common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_COMMON_H_
-#define AV1_COMMON_COMMON_H_
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
/* Interface header for common constant data structures and lookup tables */
@@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
} // extern "C"
#endif
-#endif // AV1_COMMON_COMMON_H_
+#endif // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
index f521f10bf..46e455fdb 100644
--- a/third_party/aom/av1/common/common_data.h
+++ b/third_party/aom/av1/common/common_data.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_COMMON_DATA_H_
-#define AV1_COMMON_COMMON_DATA_H_
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
#include "av1/common/enums.h"
#include "aom/aom_integer.h"
@@ -20,34 +20,43 @@
extern "C" {
#endif
-// Log 2 conversion lookup tables in units of mode info(4x4).
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
};
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
};
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
};
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
};
-// Width/height lookup tables in units of various block sizes
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32,
64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64
};
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64,
32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16
};
-// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize)))
static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
};
@@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
};
+// A compressed version of the Partition_Subsize table in the spec (9.3.
+// Conversion tables), for square block sizes only.
/* clang-format off */
static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
{ // PARTITION_NONE
@@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_64X64, // TX_MODE_LARGEST
TX_64X64, // TX_MODE_SELECT
};
-/* clang-format on */
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
- // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
- // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
- { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
- { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } },
- { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } },
- { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
- { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } },
- { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } },
- { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
- { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } },
- { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } },
- { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
- { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
- { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
- { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
- { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } },
- { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } },
- { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
- { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
- { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
- { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
};
+/* clang-format on */
// Generates 5 bit field in which each bit set to 1 represents
// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16
@@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = {
} // extern "C"
#endif
-#endif // AV1_COMMON_COMMON_DATA_H_
+#endif // AOM_AV1_COMMON_COMMON_DATA_H_
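
The realigned ss_size_lookup reads as [bsize][ss_x][ss_y]; a two-row, string-valued excerpt (illustrative names only) showing how get_plane_block_size would index it:

#include <assert.h>
#include <string.h>

/* Excerpt of the corrected table, block sizes spelled as strings. */
static const char *ss_size_demo[2][2][2] = {
  /* BLOCK_8X8  */ { { "8X8", "8X4" }, { "4X8", "4X4" } },
  /* BLOCK_8X16 */ { { "8X16", "8X8" }, { "INVALID", "4X8" } },
};

int main(void) {
  /* 4:2:0 (ss_x = ss_y = 1): an 8x16 luma block has 4x8 chroma. */
  assert(strcmp(ss_size_demo[1][1][1], "4X8") == 0);
  /* The fix also marks some subsampling combinations BLOCK_INVALID,
     matching the spec's Subsampled_Size table, e.g. 8x16 with
     ss_x = 1, ss_y = 0. */
  assert(strcmp(ss_size_demo[1][1][0], "INVALID") == 0);
  return 0;
}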
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index ed962c722..1f11126fc 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper(
y_step_qn, conv_params);
}
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general, so
+// we could add an optimized 2-tap path for all bilinear filtering usages, not
+// just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ int subpel_x_q4, int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0, conv_params);
+ } else if (subpel_x_q4 != 0) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ } else {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ }
+}
+
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilters interp_filters, const int subpel_x_q4,
int x_step_q4, const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf) {
+ const struct scale_factors *sf, int is_intrabc) {
+ assert(IMPLIES(is_intrabc, !scaled));
(void)x_step_q4;
(void)y_step_q4;
(void)dst;
(void)dst_stride;
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+ subpel_y_q4, conv_params);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
const InterpFilterParams *filter_params_x =
- av1_get_interp_filter_params_with_block_size(filter_x, w);
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
const InterpFilterParams *filter_params_y =
- av1_get_interp_filter_params_with_block_size(filter_y, h);
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
- if (scaled)
+ if (scaled) {
convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_q4,
x_step_q4, subpel_y_q4, y_step_q4, conv_params);
- else
+ } else {
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, filter_params_x,
filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ }
}
void av1_highbd_convolve_2d_copy_sr_c(
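
convolve_2d_for_intrabc above routes to the cheapest kernel for whichever subpel components are nonzero (the facade only calls it when at least one is); the dispatch shape in miniature, with invented demo names:

#include <stdio.h>

/* Mirrors the if/else ladder: a full 2D pass only when both components
   are fractional, otherwise a single-direction pass. */
static const char *pick_kernel(int subpel_x_q4, int subpel_y_q4) {
  if (subpel_x_q4 != 0 && subpel_y_q4 != 0) return "2d";
  if (subpel_x_q4 != 0) return "x-only";
  return "y-only";
}

int main(void) {
  printf("%s\n", pick_kernel(8, 0)); /* half-pel in x only: x-only */
  printf("%s\n", pick_kernel(8, 8)); /* both fractional: 2d */
  return 0;
}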
@@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
}
}
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, int subpel_x_q4,
+ int subpel_y_q4,
+ ConvolveParams *conv_params,
+ int bd) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else if (subpel_x_q4 != 0) {
+ av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else {
+ av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ }
+}
+
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst8, int dst_stride, int w, int h,
InterpFilters interp_filters,
const int subpel_x_q4, int x_step_q4,
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int bd) {
+ const struct scale_factors *sf,
+ int is_intrabc, int bd) {
+ assert(IMPLIES(is_intrabc, !scaled));
(void)x_step_q4;
(void)y_step_q4;
(void)dst_stride;
-
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+ subpel_x_q4, subpel_y_q4, conv_params, bd);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
const InterpFilterParams *filter_params_x =
- av1_get_interp_filter_params_with_block_size(filter_x, w);
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
const InterpFilterParams *filter_params_y =
- av1_get_interp_filter_params_with_block_size(filter_y, h);
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
if (scaled) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
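
A worked instance of the corrected intermediate-height bound in av1_wiener_convolve_add_src_c, plugging in the usual SUBPEL_BITS = 4 and SUBPEL_TAPS = 8 constants:

#include <assert.h>

enum { SUBPEL_BITS_DEMO = 4, SUBPEL_TAPS_DEMO = 8 };

int main(void) {
  /* Unscaled vertical filtering: y_step_q4 = 16 (one pel per row). */
  const int h = 32, y_step_q4 = 16, y0_q4 = 0;
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS_DEMO) +
      SUBPEL_TAPS_DEMO - 1;
  /* 38 rows: one fewer than the old `+ SUBPEL_TAPS` bound. */
  assert(intermediate_height == h + SUBPEL_TAPS_DEMO - 2);
  return 0;
}

The added memset zeroes the row just past that region; the 8-tap vertical loop still touches it, and since the Wiener kernels carry a zero final tap the row contributes nothing but must be initialized.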
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index bc2d4bccf..4109dd843 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_AV1_CONVOLVE_H_
-#define AV1_COMMON_AV1_CONVOLVE_H_
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
#include "av1/common/filter.h"
#ifdef __cplusplus
@@ -19,7 +19,6 @@ extern "C" {
typedef uint16_t CONV_BUF_TYPE;
typedef struct ConvolveParams {
- int ref;
int do_average;
CONV_BUF_TYPE *dst;
int dst_stride;
@@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
InterpFilters interp_filters, const int subpel_x_q4,
int x_step_q4, const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf);
+ const struct scale_factors *sf, int is_intrabc);
-static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
- int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
CONV_BUF_TYPE *dst,
int dst_stride,
int is_compound, int bd) {
ConvolveParams conv_params;
- conv_params.ref = ref;
conv_params.do_average = do_average;
assert(IMPLIES(do_average, is_compound));
conv_params.is_compound = is_compound;
@@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
return conv_params;
}
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
int bd) {
- return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+ return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
}
static INLINE ConvolveParams get_conv_params_wiener(int bd) {
ConvolveParams conv_params;
(void)bd;
- conv_params.ref = 0;
conv_params.do_average = 0;
conv_params.is_compound = 0;
conv_params.round_0 = WIENER_ROUND0_BITS;
@@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
const int subpel_x_q4, int x_step_q4,
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int bd);
+ const struct scale_factors *sf,
+ int is_intrabc, int bd);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_AV1_CONVOLVE_H_
+#endif // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
index ef944c5a0..991692c2f 100644
--- a/third_party/aom/av1/common/entropy.h
+++ b/third_party/aom/av1/common/entropy.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPY_H_
-#define AV1_COMMON_ENTROPY_H_
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
#include "config/aom_config.h"
@@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPY_H_
+#endif // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
index 0bd2e20a1..7047f34d2 100644
--- a/third_party/aom/av1/common/entropymode.h
+++ b/third_party/aom/av1/common/entropymode.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPYMODE_H_
-#define AV1_COMMON_ENTROPYMODE_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
#include "av1/common/entropy.h"
#include "av1/common/entropymv.h"
@@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas);
void av1_setup_frame_contexts(struct AV1Common *cm);
void av1_setup_past_independence(struct AV1Common *cm);
+// Returns (int)ceil(log2(n)).
+// NOTE: This implementation only works for n <= 2^30.
static INLINE int av1_ceil_log2(int n) {
if (n < 2) return 0;
int i = 1, p = 2;
@@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPYMODE_H_
+#endif // AOM_AV1_COMMON_ENTROPYMODE_H_
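
The contract documented on av1_ceil_log2 above is easy to spot-check; a standalone sketch completing the visible prefix with the natural doubling loop (an assumption, since the hunk shows only the first lines), plus a few cases:

#include <assert.h>

static int ceil_log2_demo(int n) {
  if (n < 2) return 0;
  int i = 1, p = 2;
  while (p < n) {
    i++;
    p = p << 1; /* would overflow int for n > 2^30: the documented bound */
  }
  return i;
}

int main(void) {
  assert(ceil_log2_demo(1) == 0);
  assert(ceil_log2_demo(8) == 3);
  assert(ceil_log2_demo(9) == 4); /* ceil, not floor */
  assert(ceil_log2_demo(1 << 30) == 30);
  return 0;
}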
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
index 446aa433c..491337387 100644
--- a/third_party/aom/av1/common/entropymv.c
+++ b/third_party/aom/av1/common/entropymv.c
@@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = {
} },
};
-static const uint8_t log_in_base_2[] = {
- 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
-};
-
-static INLINE int mv_class_base(MV_CLASS_TYPE c) {
- return c ? CLASS0_SIZE << (c + 2) : 0;
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
- const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
- ? MV_CLASS_10
- : (MV_CLASS_TYPE)log_in_base_2[z >> 3];
- if (offset) *offset = z - mv_class_base(c);
- return c;
-}
-
void av1_init_mv_probs(AV1_COMMON *cm) {
// NB: this sets CDFs too
cm->fc->nmvc = default_nmv_context;
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
index 02ca7b66b..fa818a2c1 100644
--- a/third_party/aom/av1/common/entropymv.h
+++ b/third_party/aom/av1/common/entropymv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPYMV_H_
-#define AV1_COMMON_ENTROPYMV_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
#include "config/aom_config.h"
@@ -91,16 +91,6 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
-static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
- if (mv->row == 0) {
- return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
- } else {
- return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
- }
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
-
typedef enum {
MV_SUBPEL_NONE = -1,
MV_SUBPEL_LOW_PRECISION = 0,
@@ -111,4 +101,4 @@ typedef enum {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPYMV_H_
+#endif // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index 689c25f30..869c06ef2 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENUMS_H_
-#define AV1_COMMON_ENUMS_H_
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
#include "config/aom_config.h"
@@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED {
TX_TYPES,
} TX_TYPE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
REG_REG,
REG_SMOOTH,
REG_SHARP,
@@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED {
COMP_INTER_MODE_START = NEAREST_NEARESTMV,
COMP_INTER_MODE_END = MB_MODE_COUNT,
COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+ INTER_MODE_START = NEARESTMV,
+ INTER_MODE_END = MB_MODE_COUNT,
INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
} PREDICTION_MODE;
@@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED {
INTERINTRA_MODES
} INTERINTRA_MODE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
COMPOUND_AVERAGE,
COMPOUND_WEDGE,
COMPOUND_DIFFWTD,
@@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENUMS_H_
+#endif // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 7f8ad583a..571422d11 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_FILTER_H_
-#define AV1_COMMON_FILTER_H_
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
#include <assert.h>
@@ -139,6 +139,17 @@ static const InterpFilterParams
BILINEAR }
};
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full-pel
+// MVs for luma, so with chroma sub-sampling the chroma MVs may be half-pel.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+ 64,
+ 64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+ av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
DECLARE_ALIGNED(256, static const InterpKernel,
av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
@@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
return &av1_interp_filter_params_list[interp_filter];
}
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+ const InterpFilter interp_filter) {
+ return &av1_interp_4tap[interp_filter];
+}
+
static INLINE const int16_t *av1_get_interp_filter_kernel(
const InterpFilter interp_filter) {
return av1_interp_filter_params_list[interp_filter].filter_ptr;
@@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
} // extern "C"
#endif
-#endif // AV1_COMMON_FILTER_H_
+#endif // AOM_AV1_COMMON_FILTER_H_
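
With the { 64, 64 } kernel above and libaom's FILTER_BITS = 7 normalization (the taps sum to 128), half-pel IntraBC chroma filtering reduces to a rounded average of two neighbors; a minimal sketch:

#include <assert.h>
#include <stdint.h>

enum { FILTER_BITS_DEMO = 7 }; /* taps sum to 128 = 1 << 7 */

static uint8_t bilinear_halfpel(uint8_t a, uint8_t b) {
  const int16_t taps[2] = { 64, 64 }; /* av1_intrabc_bilinear_filter */
  const int sum = a * taps[0] + b * taps[1];
  return (uint8_t)((sum + (1 << (FILTER_BITS_DEMO - 1))) >> FILTER_BITS_DEMO);
}

int main(void) {
  assert(bilinear_halfpel(10, 20) == 15);
  assert(bilinear_halfpel(10, 11) == 11); /* rounds .5 up */
  return 0;
}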
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
index 502ccd27d..fd6c4bc79 100644
--- a/third_party/aom/av1/common/frame_buffers.c
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
list->int_fb = NULL;
}
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+ int i;
+
+ assert(list != NULL);
+
+ for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+ if (list->int_fb[i].data && !list->int_fb[i].in_use)
+ memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+ }
+}
+
int av1_get_frame_buffer(void *cb_priv, size_t min_size,
aom_codec_frame_buffer_t *fb) {
int i;
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
index e7341cfdd..16188e51c 100644
--- a/third_party/aom/av1/common/frame_buffers.h
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_FRAME_BUFFERS_H_
-#define AV1_COMMON_FRAME_BUFFERS_H_
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
#include "aom/aom_frame_buffer.h"
#include "aom/aom_integer.h"
@@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
// Free any data allocated to the frame buffers.
void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
// Callback used by libaom to request an external frame buffer. |cb_priv|
// Callback private data, which points to an InternalFrameBufferList.
// |min_size| is the minimum size in bytes needed to decode the next frame.
@@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
} // extern "C"
#endif
-#endif // AV1_COMMON_FRAME_BUFFERS_H_
+#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_
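
A reduced model of the new av1_zero_unused_internal_frame_buffers contract (the struct fields here are invented for the demo, mirroring the list's data/size/in_use):

#include <assert.h>
#include <string.h>

typedef struct {
  unsigned char data[8];
  size_t size;
  int in_use;
} FbDemo;

/* Wipe only buffers nothing still references, borders included. */
static void zero_unused(FbDemo *fb, int n) {
  for (int i = 0; i < n; ++i)
    if (!fb[i].in_use) memset(fb[i].data, 0, fb[i].size);
}

int main(void) {
  FbDemo fb[2] = { { { 9 }, 8, 1 }, { { 9 }, 8, 0 } };
  zero_unused(fb, 2);
  assert(fb[0].data[0] == 9); /* in use: untouched */
  assert(fb[1].data[0] == 0); /* unused: re-initialized */
  return 0;
}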
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
index bc758eb57..2c1cb9827 100644
--- a/third_party/aom/av1/common/idct.c
+++ b/third_party/aom/av1/common/idct.c
@@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) {
// that input and output could be the same buffer.
// idct
-static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
- int stride, int eob, int bd) {
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
if (eob > 1)
av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
else
av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
-static const int32_t *cast_to_int32(const tran_low_t *input) {
- assert(sizeof(int32_t) == sizeof(tran_low_t));
- return (const int32_t *)input;
-}
-
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
int eob = txfm_param->eob;
int bd = txfm_param->bd;
@@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
const TX_TYPE tx_type = txfm_param->tx_type;
if (lossless) {
assert(tx_type == DCT_DCT);
- highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
return;
}
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- }
+
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
}
-static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
-}
-static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
}
-static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
bd);
- break;
- }
}
-static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default:
- av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- }
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default: assert(0);
- }
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
-static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
assert(tx_type == DCT_DCT);
- av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+ av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
@@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
}
-static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
case TX_32X32:
- highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
break;
case TX_16X16:
- highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
break;
case TX_8X8:
- highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
break;
case TX_4X8:
- highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
break;
case TX_8X4:
- highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
break;
case TX_8X16:
- highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
break;
case TX_16X8:
- highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
break;
case TX_16X32:
- highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
break;
case TX_32X16:
- highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
break;
case TX_64X64:
- highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
break;
case TX_32X64:
- highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
break;
case TX_64X32:
- highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
break;
case TX_16X64:
- highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
break;
case TX_64X16:
- highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
break;
case TX_4X4:
// this is like av1_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
break;
case TX_16X4:
- highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
break;
case TX_4X16:
- highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
break;
case TX_8X32:
- highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
break;
case TX_32X8:
- highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
break;
default: assert(0 && "Invalid transform size"); break;
}
@@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
}
}
- highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+ txfm_param);
for (int r = 0; r < h; ++r) {
for (int c = 0; c < w; ++c) {
@@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
if (txfm_param.is_hbd) {
- highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
} else {
av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
}
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
index 50032a167..d9454e73f 100644
--- a/third_party/aom/av1/common/idct.h
+++ b/third_party/aom/av1/common/idct.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_IDCT_H_
-#define AV1_COMMON_IDCT_H_
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
#include "config/aom_config.h"
@@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
const tran_low_t *dqcoeff, int plane,
TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+ assert(sizeof(int32_t) == sizeof(tran_low_t));
+ return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *param);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_IDCT_H_
+#endif // AOM_AV1_COMMON_IDCT_H_
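
Declaring every per-size entry point through the single highbd_inv_txfm_add typedef keeps the prototypes in lockstep; the same pattern in miniature (demo names invented):

#include <stdio.h>

typedef void (demo_txfm_fn)(const int *input, unsigned char *dest, int stride);

/* Each declaration must match demo_txfm_fn's signature exactly. */
demo_txfm_fn add_4x8_demo;
demo_txfm_fn add_8x4_demo;

void add_4x8_demo(const int *input, unsigned char *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("4x8 path");
}

void add_8x4_demo(const int *input, unsigned char *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("8x4 path");
}

int main(void) {
  demo_txfm_fn *dispatch = add_4x8_demo; /* also usable in a table */
  dispatch(NULL, NULL, 0);
  return 0;
}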
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index c2495640e..5b0225192 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_MV_H_
-#define AV1_COMMON_MV_H_
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
#include "av1/common/common.h"
#include "av1/common/common_data.h"
@@ -56,7 +56,7 @@ typedef struct mv32 {
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
IDENTITY = 0, // identity transformation, 0-parameter
TRANSLATION = 1, // translational motion 2-parameter
ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
@@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
} // extern "C"
#endif
-#endif // AV1_COMMON_MV_H_
+#endif // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
index 6939df335..7f24ab4e6 100644
--- a/third_party/aom/av1/common/mvref_common.c
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) {
den = AOMMIN(den, MAX_FRAME_DISTANCE);
num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
: AOMMAX(num, -MAX_FRAME_DISTANCE);
- int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
- int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+ const int mv_row =
+ ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+ const int mv_col =
+ ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
const int clamp_max = MV_UPP - 1;
const int clamp_min = MV_LOW + 1;
output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
}
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
- int mi_row, int mi_col, int x_mis, int y_mis) {
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis) {
const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
MV_REF *frame_mvs =
cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
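
get_mv_projection scales a reference MV by num/den in Q14 fixed point; assuming div_mult[den] = (1 << 14) / den for the clamped denominators, a worked instance of the rounding:

#include <assert.h>

/* ROUND_POWER_OF_TWO_SIGNED: round half away from zero, then shift. */
static int round_pow2_signed(int value, int n) {
  return (value < 0) ? -((-value + (1 << (n - 1))) >> n)
                     : (value + (1 << (n - 1))) >> n;
}

int main(void) {
  const int den = 3, num = 2, ref_row = 24;
  const int mult = (1 << 14) / den; /* div_mult[3] == 5461 */
  const int mv_row = round_pow2_signed(ref_row * num * mult, 14);
  assert(mv_row == 16); /* == 24 * 2 / 3 */
  return 0;
}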
@@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
uint8_t *ref_match_count, uint8_t *newmv_count,
int_mv *gm_mv_candidates, int max_row_offset,
int *processed_rows) {
- int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
const int n8_w_8 = mi_size_wide[BLOCK_8X8];
const int n8_w_16 = mi_size_wide[BLOCK_16X16];
int i;
int col_offset = 0;
- const int shift = 0;
// TODO(jingning): Revisit this part after cb4x4 is stable.
if (abs(row_offset) > 1) {
col_offset = 1;
- if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
+ if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
}
- const int use_step_16 = (xd->n8_w >= 16);
+ const int use_step_16 = (xd->n4_w >= 16);
MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
(void)mi_row;
for (i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
const int candidate_bsize = candidate->sb_type;
- const int n8_w = mi_size_wide[candidate_bsize];
- int len = AOMMIN(xd->n8_w, n8_w);
+ const int n4_w = mi_size_wide[candidate_bsize];
+ int len = AOMMIN(xd->n4_w, n4_w);
if (use_step_16)
len = AOMMAX(n8_w_16, len);
else if (abs(row_offset) > 1)
len = AOMMAX(len, n8_w_8);
int weight = 2;
- if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) {
+ if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
int inc = AOMMIN(-max_row_offset + row_offset + 1,
mi_size_high[candidate_bsize]);
// Obtain range used in weight calculation.
- weight = AOMMAX(weight, (inc << shift));
+ weight = AOMMAX(weight, inc);
// Update processed rows.
*processed_rows = inc - row_offset - 1;
}
@@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
uint8_t *ref_match_count, uint8_t *newmv_count,
int_mv *gm_mv_candidates, int max_col_offset,
int *processed_cols) {
- int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
const int n8_h_8 = mi_size_high[BLOCK_8X8];
const int n8_h_16 = mi_size_high[BLOCK_16X16];
int i;
int row_offset = 0;
- const int shift = 0;
if (abs(col_offset) > 1) {
row_offset = 1;
- if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
+ if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
}
- const int use_step_16 = (xd->n8_h >= 16);
+ const int use_step_16 = (xd->n4_h >= 16);
(void)mi_col;
for (i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate =
xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
const int candidate_bsize = candidate->sb_type;
- const int n8_h = mi_size_high[candidate_bsize];
- int len = AOMMIN(xd->n8_h, n8_h);
+ const int n4_h = mi_size_high[candidate_bsize];
+ int len = AOMMIN(xd->n4_h, n4_h);
if (use_step_16)
len = AOMMAX(n8_h_16, len);
else if (abs(col_offset) > 1)
len = AOMMAX(len, n8_h_8);
int weight = 2;
- if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) {
+ if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
int inc = AOMMIN(-max_col_offset + col_offset + 1,
mi_size_wide[candidate_bsize]);
// Obtain range used in weight calculation.
- weight = AOMMAX(weight, (inc << shift));
+ weight = AOMMAX(weight, inc);
// Update processed cols.
*processed_cols = inc - col_offset - 1;
}
@@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
mi_pos.row = row_offset;
mi_pos.col = col_offset;
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
const MB_MODE_INFO *const candidate =
xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
const int len = mi_size_wide[BLOCK_8X8];
@@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
// The left hand of two vertical rectangles always has a top right (as the
// block above will have been decoded)
- if (xd->n8_w < xd->n8_h)
+ if (xd->n4_w < xd->n4_h)
if (!xd->is_sec_rect) has_tr = 1;
// The bottom of two horizontal rectangles never has a top right (as the block
// to the right won't have been decoded)
- if (xd->n8_w > xd->n8_h)
+ if (xd->n4_w > xd->n4_h)
if (xd->is_sec_rect) has_tr = 0;
// The bottom left square of a Vertical A (in the old format) does
// not have a top right as it is decoded before the right hand
// rectangle of the partition
if (xd->mi[0]->partition == PARTITION_VERT_A) {
- if (xd->n8_w == xd->n8_h)
+ if (xd->n4_w == xd->n4_h)
if (mask_row & bs) has_tr = 0;
}
@@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
- if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
const TPL_MV_REF *prev_frame_mvs =
cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
@@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
return 0;
}
+static void process_compound_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+ int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+ for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+ if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+ ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+ ++ref_id_count[cmp_idx];
+ } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[can_rf] !=
+ cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+ ++ref_diff_count[cmp_idx];
+ }
+ }
+ }
+}
+
+static void process_single_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+ cm->ref_frame_sign_bias[ref_frame]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ int stack_idx;
+ for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+ const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+ if (this_mv.as_int == stack_mv.as_int) break;
+ }
+
+ if (stack_idx == refmv_count[ref_frame]) {
+ ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+        // TODO(jingning): The weight is set to an arbitrary small number here;
+        // its value doesn't matter as long as it is properly initialized.
+ ref_mv_stack[ref_frame][stack_idx].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+}
+
static void setup_ref_mv_list(
const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
uint8_t refmv_count[MODE_CTX_REF_FRAMES],
CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
int mi_row, int mi_col, int16_t *mode_context) {
- const int bs = AOMMAX(xd->n8_w, xd->n8_h);
+ const int bs = AOMMAX(xd->n4_w, xd->n4_h);
const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
MV_REFERENCE_FRAME rf[2];
const TileInfo *const tile = &xd->tile;
int max_row_offset = 0, max_col_offset = 0;
- const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
- const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
int processed_rows = 0;
int processed_cols = 0;
@@ -455,17 +511,16 @@ static void setup_ref_mv_list(
if (xd->up_available) {
max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
- if (xd->n8_h < mi_size_high[BLOCK_8X8])
+ if (xd->n4_h < mi_size_high[BLOCK_8X8])
max_row_offset = -(2 << 1) + row_adj;
- max_row_offset =
- find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
+ max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
}
if (xd->left_available) {
max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
- if (xd->n8_w < mi_size_wide[BLOCK_8X8])
+ if (xd->n4_w < mi_size_wide[BLOCK_8X8])
max_col_offset = -(2 << 1) + col_adj;
max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -487,12 +542,12 @@ static void setup_ref_mv_list(
gm_mv_candidates, max_col_offset, &processed_cols);
// Check top-right boundary
if (has_tr)
- scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
gm_mv_candidates, &refmv_count[ref_frame]);
- uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
- uint8_t nearest_refmv_count = refmv_count[ref_frame];
+ const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t nearest_refmv_count = refmv_count[ref_frame];
// TODO(yunqing): for comp_search, do it for all 3 cases.
for (int idx = 0; idx < nearest_refmv_count; ++idx)
@@ -500,27 +555,27 @@ static void setup_ref_mv_list(
if (cm->allow_ref_frame_mvs) {
int is_available = 0;
- const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
- const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
- const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
- const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+ const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
const int tpl_sample_pos[3][2] = {
{ voffset, -2 },
{ voffset, hoffset },
{ voffset - 2, hoffset },
};
- const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
- (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
- (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
- (xd->n8_w < mi_size_wide[BLOCK_64X64]);
-
- int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
- ? mi_size_high[BLOCK_16X16]
- : mi_size_high[BLOCK_8X8];
- int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
- ? mi_size_wide[BLOCK_16X16]
- : mi_size_wide[BLOCK_8X8];
+ const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+ (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+ (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+ const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+ ? mi_size_high[BLOCK_16X16]
+ : mi_size_high[BLOCK_8X8];
+ const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+ ? mi_size_wide[BLOCK_16X16]
+ : mi_size_wide[BLOCK_8X8];
for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
@@ -569,7 +624,7 @@ static void setup_ref_mv_list(
max_col_offset, &processed_cols);
}
- uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
switch (nearest_match) {
case 0:
@@ -636,62 +691,24 @@ static void setup_ref_mv_list(
int_mv ref_id[2][2], ref_diff[2][2];
int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
int mi_size = AOMMIN(mi_width, mi_height);
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
- const int candidate_bsize = candidate->sb_type;
-
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
- for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
- if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
- ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
- ++ref_id_count[cmp_idx];
- } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[can_rf] !=
- cm->ref_frame_sign_bias[rf[cmp_idx]]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
- ++ref_diff_count[cmp_idx];
- }
- }
- }
- idx += mi_size_wide[candidate_bsize];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_wide[candidate->sb_type];
}
for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
- const int candidate_bsize = candidate->sb_type;
-
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
- for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
- if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
- ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
- ++ref_id_count[cmp_idx];
- } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[can_rf] !=
- cm->ref_frame_sign_bias[rf[cmp_idx]]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
- ++ref_diff_count[cmp_idx];
- }
- }
- }
- idx += mi_size_high[candidate_bsize];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_high[candidate->sb_type];
}
// Build up the compound mv predictor
@@ -743,87 +760,37 @@ static void setup_ref_mv_list(
for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
}
} else {
// Handle single reference frame extension
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
int mi_size = AOMMIN(mi_width, mi_height);
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
- const int candidate_bsize = candidate->sb_type;
-
- // TODO(jingning): Refactor the following code.
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
- cm->ref_frame_sign_bias[ref_frame]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- int stack_idx;
- for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
- int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
- if (this_mv.as_int == stack_mv.as_int) break;
- }
-
- if (stack_idx == refmv_count[ref_frame]) {
- ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
- // TODO(jingning): Set an arbitrary small number here. The weight
- // doesn't matter as long as it is properly initialized.
- ref_mv_stack[ref_frame][stack_idx].weight = 2;
- ++refmv_count[ref_frame];
- }
- }
- }
- idx += mi_size_wide[candidate_bsize];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_wide[candidate->sb_type];
}
for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
- const int candidate_bsize = candidate->sb_type;
-
- // TODO(jingning): Refactor the following code.
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
- cm->ref_frame_sign_bias[ref_frame]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- int stack_idx;
- for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
- int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
- if (this_mv.as_int == stack_mv.as_int) break;
- }
-
- if (stack_idx == refmv_count[ref_frame]) {
- ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
- // TODO(jingning): Set an arbitrary small number here. The weight
- // doesn't matter as long as it is properly initialized.
- ref_mv_stack[ref_frame][stack_idx].weight = 2;
- ++refmv_count[ref_frame];
- }
- }
- }
- idx += mi_size_high[candidate_bsize];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_high[candidate->sb_type];
}
for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
}
if (mv_ref_list != NULL) {
@@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
: -((-mv.col) >> (4 + MI_SIZE_LOG2));
- int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
- int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+ const int row =
+ (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+ const int col =
+ (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
col >= (cm->mi_cols >> 1))
@@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
return 1;
}
-static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
- int dir) {
+// Note: motion_field_projection finds the motion vectors of a reference
+// frame of the current frame and projects them onto the current frame. For
+// clarity, call the current frame's reference frame the start frame, call
+// the start frame's own reference frames the reference frames, and call
+// ref_offset the frame distances between the start frame and its reference
+// frames.
+static int motion_field_projection(AV1_COMMON *cm,
+ MV_REFERENCE_FRAME start_frame, int dir) {
TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
int ref_offset[REF_FRAMES] = { 0 };
(void)dir;
- int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
- if (ref_frame_idx < 0) return 0;
+ const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+ if (start_frame_idx < 0) return 0;
- if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0;
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
- if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
- cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+ cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
return 0;
- int ref_frame_index =
- cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset;
- unsigned int *ref_rf_idx =
- &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0];
- int cur_frame_index = cm->cur_frame->cur_frame_offset;
- int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
+ const int start_frame_offset =
+ cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+ const unsigned int *const ref_frame_offsets =
+ &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+ const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+ int start_to_current_frame_offset =
+ get_relative_dist(cm, start_frame_offset, cur_frame_offset);
for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
- ref_offset[rf] =
- get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
+ ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+ ref_frame_offsets[rf - LAST_FRAME]);
}
- if (dir == 2) ref_to_cur = -ref_to_cur;
+ if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
- MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs;
+ MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
const int mvs_rows = (cm->mi_rows + 1) >> 1;
const int mvs_cols = (cm->mi_cols + 1) >> 1;
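
A concrete walk-through of the renamed quantities may help. Note that get_relative_dist is order-hint-aware wrap-around subtraction in libaom; the sketch below reduces it to a plain difference, and all frame offsets are illustrative:

    #include <stdio.h>

    int main(void) {
      /* Current frame at display order 10; its reference (the start frame)
       * at order 8; the start frame's stored MV points to order 4. */
      const int cur_frame_offset = 10;
      const int start_frame_offset = 8;
      const int ref_frame_offset = start_frame_offset - 4; /* 4 */
      const int start_to_current_frame_offset =
          start_frame_offset - cur_frame_offset; /* -2 */
      const int stored_mv_row = 32; /* MV: start frame -> its reference */
      /* get_mv_projection scales by start_to_current / ref_frame_offset: */
      printf("projected row: %d\n", /* -16: half magnitude, flipped sign */
             stored_mv_row * start_to_current_frame_offset / ref_frame_offset);
      return 0;
    }
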
@@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
int mi_r, mi_c;
const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
- int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
- ref_frame_offset > 0 &&
- abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
+ int pos_valid =
+ abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+ ref_frame_offset > 0 &&
+ abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
if (pos_valid) {
- get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
- ref_frame_offset);
+ get_mv_projection(&this_mv.as_mv, fwd_mv,
+ start_to_current_frame_offset, ref_frame_offset);
pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
this_mv.as_mv, dir >> 1);
}
if (pos_valid) {
- int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+ const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
if (up_available) {
int mi_row_offset = -1;
MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
- uint8_t n8_w = mi_size_wide[mbmi->sb_type];
+ uint8_t n4_w = mi_size_wide[mbmi->sb_type];
- if (xd->n8_w <= n8_w) {
+ if (xd->n4_w <= n4_w) {
// Handle "current block width <= above block width" case.
- int col_offset = -mi_col % n8_w;
+ int col_offset = -mi_col % n4_w;
if (col_offset < 0) do_tl = 0;
- if (col_offset + n8_w > xd->n8_w) do_tr = 0;
+ if (col_offset + n4_w > xd->n4_w) do_tr = 0;
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
// Handle "current block width > above block width" case.
- for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+ for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
int mi_col_offset = i;
mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n8_w = mi_size_wide[mbmi->sb_type];
- mi_step = AOMMIN(xd->n8_w, n8_w);
+ n4_w = mi_size_wide[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_w, n4_w);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int mi_col_offset = -1;
MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
- uint8_t n8_h = mi_size_high[mbmi->sb_type];
+ uint8_t n4_h = mi_size_high[mbmi->sb_type];
- if (xd->n8_h <= n8_h) {
+ if (xd->n4_h <= n4_h) {
      // Handle "current block height <= left block height" case.
- int row_offset = -mi_row % n8_h;
+ int row_offset = -mi_row % n4_h;
if (row_offset < 0) do_tl = 0;
@@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
      // Handle "current block height > left block height" case.
- for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+ for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
int mi_row_offset = i;
mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n8_h = mi_size_high[mbmi->sb_type];
- mi_step = AOMMIN(xd->n8_h, n8_h);
+ n4_h = mi_size_high[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_h, n4_h);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
// Top-right block
if (do_tr &&
- has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) {
- POSITION trb_pos = { -1, xd->n8_w };
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+ POSITION trb_pos = { -1, xd->n4_w };
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
int mi_row_offset = -1;
- int mi_col_offset = xd->n8_w;
+ int mi_col_offset = xd->n4_w;
MB_MODE_INFO *mbmi =
xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
- record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1);
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
np++;
if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
}
@@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
REF_FRAME_INFO *ref_info) {
- assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME);
+ assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
const int buf_idx = ref_info->buf_idx;
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index f68c159e1..83f7a1ac0 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_MVREF_COMMON_H_
-#define AV1_COMMON_MVREF_COMMON_H_
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
#include "av1/common/onyxc_int.h"
#include "av1/common/blockd.h"
@@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
- int mi_rows, const POSITION *mi_pos) {
- const int dependent_horz_tile_flag = 0;
- if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
- return !(mi_row + mi_pos->row < 0 ||
- mi_col + mi_pos->col < tile->mi_col_start ||
- mi_row + mi_pos->row >= mi_rows ||
- mi_col + mi_pos->col >= tile->mi_col_end);
- } else {
- return !(mi_row + mi_pos->row < tile->mi_row_start ||
- mi_col + mi_pos->col < tile->mi_col_start ||
- mi_row + mi_pos->row >= tile->mi_row_end ||
- mi_col + mi_pos->col >= tile->mi_col_end);
- }
+ const POSITION *mi_pos) {
+ return !(mi_row + mi_pos->row < tile->mi_row_start ||
+ mi_col + mi_pos->col < tile->mi_col_start ||
+ mi_row + mi_pos->row >= tile->mi_row_end ||
+ mi_col + mi_pos->col >= tile->mi_col_end);
}
static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
- int mi_rows, int row_offset) {
- const int dependent_horz_tile_flag = 0;
- if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
- return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
- else
- return clamp(row_offset, tile->mi_row_start - mi_row,
- tile->mi_row_end - mi_row - 1);
+ int row_offset) {
+ return clamp(row_offset, tile->mi_row_start - mi_row,
+ tile->mi_row_end - mi_row - 1);
}
static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
@@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
}
}
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
- int mi_row, int mi_col, int x_mis, int y_mis);
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis);
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
-#define USE_WAVE_FRONT 1 // Use only top left area of frame for reference.
static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
int mib_size, int mi_row, int mi_col) {
@@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
-#if USE_WAVE_FRONT
+ // Wavefront constraint: use only top left area of frame for reference.
const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
const int wf_offset = gradient * (active_sb_row - src_sb_row);
if (src_sb_row > active_sb_row ||
src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
return 0;
-#endif
return 1;
}
@@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
} // extern "C"
#endif
-#endif // AV1_COMMON_MVREF_COMMON_H_
+#endif // AOM_AV1_COMMON_MVREF_COMMON_H_
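
The inlined wavefront constraint can be checked in isolation. A sketch for 64x64 superblocks, hard-coding INTRABC_DELAY_SB64 = 256 / 64 = 4 and gradient = 1 + 4 + 0 = 5, which libaom derives from sb_size:

    #include <stdio.h>

    static int dv_source_ok(int active_row, int active_col, int src_row,
                            int src_col) {
      const int delay = 4;    /* INTRABC_DELAY_SB64 for 64-px superblocks */
      const int gradient = 5; /* 1 + delay + (sb_size > 64 ? 1 : 0) */
      const int wf_offset = gradient * (active_row - src_row);
      if (src_row > active_row) return 0;
      return src_col < active_col - delay + wf_offset;
    }

    int main(void) {
      printf("%d\n", dv_source_ok(3, 10, 3, 5));  /* same row, 5 SBs left: 1 */
      printf("%d\n", dv_source_ok(3, 10, 3, 7));  /* same row, 3 SBs left: 0 */
      printf("%d\n", dv_source_ok(3, 10, 2, 10)); /* one row up, same col: 1 */
      return 0;
    }
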
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
index 3918c82c6..1c90cd93f 100644
--- a/third_party/aom/av1/common/obmc.h
+++ b/third_party/aom/av1/common/obmc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_OBMC_H_
-#define AV1_COMMON_OBMC_H_
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
uint8_t nb_mi_size,
@@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
// prev_row_mi points into the mi array, starting at the beginning of the
// previous row.
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
- const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
+ const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
uint8_t mi_step;
for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
above_mi_col += mi_step) {
@@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*above_mi)) {
++nb_count;
- fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
+ fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
fun_ctxt, num_planes);
}
}
@@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
// prev_col_mi points into the mi array, starting at the top of the
// previous column
MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
- const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
+ const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
uint8_t mi_step;
for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
left_mi_row += mi_step) {
@@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*left_mi)) {
++nb_count;
- fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
+ fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
fun_ctxt, num_planes);
}
}
}
-#endif // AV1_COMMON_OBMC_H_
+#endif // AOM_AV1_COMMON_OBMC_H_
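
Both iterators in obmc.h follow the same visitor shape: the caller supplies a function of type overlappable_nb_visitor_t plus an opaque fun_ctxt pointer, and the iterator invokes it once per overlappable neighbor. A toy reduction of that pattern, with illustrative names that are not libaom API:

    #include <stdio.h>

    typedef void (*visitor_t)(int index, int value, void *ctxt);

    /* Walk items and hand each one to the caller-supplied callback. */
    static void foreach_item(const int *items, int n, visitor_t fun,
                             void *ctxt) {
      for (int i = 0; i < n; ++i) fun(i, items[i], ctxt);
    }

    static void accumulate(int index, int value, void *ctxt) {
      (void)index;
      *(int *)ctxt += value;
    }

    int main(void) {
      const int widths[] = { 4, 8, 16 };
      int total = 0;
      foreach_item(widths, 3, accumulate, &total);
      printf("total = %d\n", total); /* 28 */
      return 0;
    }
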
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 000000000..823b700b1
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+ int valid_type = 0;
+ switch (obu_type) {
+ case OBU_SEQUENCE_HEADER:
+ case OBU_TEMPORAL_DELIMITER:
+ case OBU_FRAME_HEADER:
+ case OBU_TILE_GROUP:
+ case OBU_METADATA:
+ case OBU_FRAME:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_TILE_LIST:
+ case OBU_PADDING: valid_type = 1; break;
+ default: break;
+ }
+ return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+ size_t bytes_available,
+ size_t *const obu_size,
+ size_t *const length_field_size) {
+ uint64_t u_obu_size = 0;
+ if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+ 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+ *obu_size = (size_t)u_obu_size;
+ return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+ int is_annexb, ObuHeader *header) {
+ if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+ const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+ if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size = 1;
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // Forbidden bit. Must not be set.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+ if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->has_extension = aom_rb_read_bit(rb);
+ header->has_size_field = aom_rb_read_bit(rb);
+
+ if (!header->has_size_field && !is_annexb) {
+    // Section 5 OBU streams must have the obu_size field set.
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // obu_reserved_1bit must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (header->has_extension) {
+ if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size += 1;
+ header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+ header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+ if (aom_rb_read_literal(rb, 3) != 0) {
+ // extension_header_reserved_3bits must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb) {
+ if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file,
+  // and confirm that parsing done via aom_read_bit_buffer succeeds.
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+ NULL };
+ aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+ if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+ return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read) {
+ size_t length_field_size = 0, obu_size = 0;
+ aom_codec_err_t status;
+
+ if (is_annexb) {
+ // Size field comes before the OBU header, and includes the OBU header
+ status =
+ read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ struct aom_read_bit_buffer rb = { data + length_field_size,
+ data + bytes_available, 0, NULL, NULL };
+
+ status = read_obu_header(&rb, is_annexb, obu_header);
+ if (status != AOM_CODEC_OK) return status;
+
+ if (is_annexb) {
+ // Derive the payload size from the data we've already read
+ if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+ *payload_size = obu_size - obu_header->size;
+ } else {
+ // Size field comes after the OBU header, and is just the payload size
+ status = read_obu_size(data + obu_header->size,
+ bytes_available - obu_header->size, payload_size,
+ &length_field_size);
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ *bytes_read = length_field_size + obu_header->size;
+ return AOM_CODEC_OK;
+}
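
The two exported functions compose naturally into an OBU-stream walker. A sketch for the low-overhead (non-Annex-B) format, using only the API declared in obu_util.h; error handling is intentionally minimal:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #include "av1/common/obu_util.h"

    static void walk_obus(const uint8_t *data, size_t len) {
      while (len > 0) {
        ObuHeader header;
        size_t payload_size = 0, bytes_read = 0;
        if (aom_read_obu_header_and_size(data, len, /*is_annexb=*/0, &header,
                                         &payload_size, &bytes_read) !=
            AOM_CODEC_OK) {
          fprintf(stderr, "corrupt OBU\n");
          return;
        }
        printf("type %d, header %zu bytes, payload %zu bytes\n",
               (int)header.type, header.size, payload_size);
        if (bytes_read > len || payload_size > len - bytes_read) return;
        data += bytes_read + payload_size;
        len -= bytes_read + payload_size;
      }
    }
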
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 000000000..7c56904c8
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ size_t size; // Size (1 or 2 bytes) of the OBU header (including the
+ // optional OBU extension header) in the bitstream.
+ OBU_TYPE type;
+ int has_size_field;
+ int has_extension;
+ // The following fields come from the OBU extension header and therefore are
+ // only used if has_extension is true.
+ int temporal_layer_id;
+ int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
index e87c5a0bf..e1db0f44d 100644
--- a/third_party/aom/av1/common/odintrin.h
+++ b/third_party/aom/av1/common/odintrin.h
@@ -11,8 +11,8 @@
/* clang-format off */
-#ifndef AV1_COMMON_ODINTRIN_H_
-#define AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
#include <stdlib.h>
#include <string.h>
@@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
#define OD_MAXI AOMMAX
#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
-#define OD_CLZ0 (1)
-#define OD_CLZ(x) (-get_msb(x))
-#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
/*Enable special features for gcc and compatible compilers.*/
#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
} // extern "C"
#endif
-#endif // AV1_COMMON_ODINTRIN_H_
+#endif // AOM_AV1_COMMON_ODINTRIN_H_
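
The rewritten macro is just floor(log2(x)) + 1 for nonzero x. A standalone check, with a portable loop standing in for libaom's get_msb, which uses compiler intrinsics where available:

    #include <stdio.h>

    /* Portable stand-in: index of the highest set bit. */
    static int get_msb(unsigned int x) {
      int msb = 0;
      while (x >>= 1) ++msb;
      return msb;
    }

    #define OD_ILOG_NZ(x) (1 + get_msb(x))

    int main(void) {
      printf("%d %d %d %d\n", OD_ILOG_NZ(1), OD_ILOG_NZ(2), OD_ILOG_NZ(7),
             OD_ILOG_NZ(8)); /* 1 2 3 4 */
      return 0;
    }
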
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index 6b1bf2d74..ff011c89e 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ONYXC_INT_H_
-#define AV1_COMMON_ONYXC_INT_H_
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
@@ -480,6 +480,7 @@ typedef struct AV1Common {
int byte_alignment;
int skip_loop_filter;
+ int skip_film_grain;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
xd->chroma_left_mbmi = chroma_left_mi;
}
- xd->n8_h = bh;
- xd->n8_w = bw;
+ xd->n4_h = bh;
+ xd->n4_w = bw;
xd->is_sec_rect = 0;
- if (xd->n8_w < xd->n8_h) {
+ if (xd->n4_w < xd->n4_h) {
// Only mark is_sec_rect as 1 for the last block.
// For PARTITION_VERT_4, it would be (0, 0, 0, 1);
// For other partitions, it would be (0, 1).
- if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1;
+ if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
}
- if (xd->n8_w > xd->n8_h)
- if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
+ if (xd->n4_w > xd->n4_h)
+ if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
}
static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
}
-static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
const MACROBLOCKD *xd) {
uint8_t bw = tx_size_wide[tx_size];
uint8_t bh = tx_size_high[tx_size];
if (skip) {
- bw = n8_w * MI_SIZE;
- bh = n8_h * MI_SIZE;
+ bw = n4_w * MI_SIZE;
+ bh = n4_h * MI_SIZE;
}
- set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
- set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+ set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
}
static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
@@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
} // extern "C"
#endif
-#endif // AV1_COMMON_ONYXC_INT_H_
+#endif // AOM_AV1_COMMON_ONYXC_INT_H_
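
The is_sec_rect rule in set_mi_row_col can be replayed on paper: for vertical rectangles (n4_w < n4_h), only the rectangle whose right edge reaches the parent square's boundary is marked, producing the (0, 1) and (0, 0, 0, 1) patterns the comment describes. A sketch with illustrative block sizes (mi units are 4x4 luma samples):

    #include <stdio.h>

    static int is_sec_rect(int mi_col, int n4_w, int n4_h) {
      return !((mi_col + n4_w) & (n4_h - 1));
    }

    int main(void) {
      /* 16x16 parent split into two 8x16 halves: n4_w = 2, n4_h = 4. */
      printf("%d %d\n", is_sec_rect(0, 2, 4), is_sec_rect(2, 2, 4)); /* 0 1 */
      /* PARTITION_VERT_4: four 4x16 strips, n4_w = 1. */
      printf("%d %d %d %d\n", is_sec_rect(0, 1, 4), is_sec_rect(1, 1, 4),
             is_sec_rect(2, 1, 4), is_sec_rect(3, 1, 4)); /* 0 0 0 1 */
      return 0;
    }
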
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
index 58933a7b3..026a07809 100644
--- a/third_party/aom/av1/common/ppc/cfl_ppc.c
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -24,19 +24,21 @@
#define CFL_LINE_2 128
#define CFL_LINE_3 192
-typedef vector int8_t int8x16_t;
-typedef vector uint8_t uint8x16_t;
-typedef vector int16_t int16x8_t;
-typedef vector uint16_t uint16x8_t;
-typedef vector int32_t int32x4_t;
-typedef vector uint32_t uint32x4_t;
-typedef vector uint64_t uint64x2_t;
+typedef vector signed char int8x16_t; // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
+typedef vector signed short int16x8_t; // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
+typedef vector signed int int32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
-static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
- int height, int round_offset,
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+ int width, int height, int round_offset,
int num_pel_log2) {
- const int16_t *end = pred_buf + height * CFL_BUF_LINE;
- const int16_t *sum_buf = pred_buf;
+ const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+ const int16_t *sum_buf = (const int16_t *)src_ptr;
+ const int16_t *end = sum_buf + height * CFL_BUF_LINE;
const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
const int32x4_t avg = vec_sr(sum_32x4, div_shift);
const int16x8_t vec_avg = vec_pack(avg, avg);
do {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_0 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_0 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
+ OFF_0 + CFL_BUF_LINE_BYTES, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
+ OFF_0 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
+ OFF_0 + CFL_LINE_3, dst);
if (width >= 16) {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
+ OFF_1 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
+ OFF_1 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
+ OFF_1 + CFL_LINE_3, dst);
}
if (width == 32) {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
+ OFF_2 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
+ OFF_2 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
+ OFF_2 + CFL_LINE_3, dst);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
+ OFF_3 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
+ OFF_3 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
+ OFF_3 + CFL_LINE_3, dst);
}
- } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+ } while ((dst += CFL_BUF_LINE * 4) < dst_end);
}
// Declare wrappers for VSX sizes
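
The VSX kernel above removes the block average from every prediction sample, since chroma-from-luma works on the AC contribution of the downsampled luma. A scalar reference of the same step may clarify the new src/dst split; round_offset and num_pel_log2 would normally come from libaom's size-specific wrappers, and CFL_BUF_LINE is assumed to be 32:

    #include <stdint.h>
    #include <stdio.h>

    #define CFL_BUF_LINE 32

    static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
                                   int height, int round_offset,
                                   int num_pel_log2) {
      int sum = round_offset;
      for (int j = 0; j < height; ++j)
        for (int i = 0; i < width; ++i) sum += src[j * CFL_BUF_LINE + i];
      const int avg = sum >> num_pel_log2;
      for (int j = 0; j < height; ++j)
        for (int i = 0; i < width; ++i)
          dst[j * CFL_BUF_LINE + i] =
              (int16_t)(src[j * CFL_BUF_LINE + i] - avg);
    }

    int main(void) {
      uint16_t src[4 * CFL_BUF_LINE] = { 0 };
      int16_t dst[4 * CFL_BUF_LINE];
      for (int j = 0; j < 4; ++j) src[j * CFL_BUF_LINE] = 8; /* sparse demo */
      subtract_average_c(src, dst, 4, 4, 8, 4); /* avg = (32 + 8) >> 4 = 2 */
      printf("%d %d\n", dst[0], dst[1]); /* 6 -2 */
      return 0;
    }
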
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
index d77739d85..5952441d1 100644
--- a/third_party/aom/av1/common/pred_common.c
+++ b/third_party/aom/av1/common/pred_common.c
@@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
const int ctx_offset =
(mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
- MV_REFERENCE_FRAME ref_frame =
- (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+ assert(dir == 0 || dir == 1);
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
index 6a835c467..6dba2322d 100644
--- a/third_party/aom/av1/common/pred_common.h
+++ b/third_party/aom/av1/common/pred_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_PRED_COMMON_H_
-#define AV1_COMMON_PRED_COMMON_H_
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
#include "av1/common/blockd.h"
#include "av1/common/mvref_common.h"
@@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
} // extern "C"
#endif
-#endif // AV1_COMMON_PRED_COMMON_H_
+#endif // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index ca199e94c..d1f52a660 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_QUANT_COMMON_H_
-#define AV1_COMMON_QUANT_COMMON_H_
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
#include "aom/aom_codec.h"
#include "av1/common/seg_common.h"
@@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
} // extern "C"
#endif
-#endif // AV1_COMMON_QUANT_COMMON_H_
+#endif // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b9f0b57f3..3203efce4 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
if (build_for_obmc) return 0;
- if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) {
+ if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
if (final_warp_params != NULL)
- memcpy(final_warp_params, &mbmi->wm_params[0],
- sizeof(*final_warp_params));
+ memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
return 1;
} else if (warp_types->global_warp_allowed && !gm_params->invalid) {
if (final_warp_params != NULL)
@@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
build_for_obmc, subpel_params->xs, subpel_params->ys,
&final_warp_params));
+ const int is_intrabc = mi->use_intrabc;
+ assert(IMPLIES(is_intrabc, !do_warp));
+
if (do_warp && xd->cur_frame_force_integer_mv == 0) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
pd->subsampling_x, pd->subsampling_y, conv_params);
} else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
- w, h, conv_params, interp_filters, xd->bd);
+ w, h, conv_params, interp_filters, is_intrabc,
+ xd->bd);
} else {
inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
- conv_params, interp_filters);
+ conv_params, interp_filters, is_intrabc);
}
}
@@ -574,37 +577,6 @@ static void build_masked_compound_no_round(
h, subw, subh, conv_params);
}
-static void build_masked_compound(
- uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
- const uint8_t *src1, int src1_stride,
- const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
- const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, block_size_wide[sb_type], w, h, subw, subh);
-}
-
-static void build_masked_compound_highbd(
- uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
- const uint8_t *src1_8, int src1_stride,
- const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w, int bd) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
- const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- // const uint8_t *mask =
- // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
- aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
- src1_stride, mask, block_size_wide[sb_type], w, h,
- subw, subh, bd);
-}
-
void av1_make_masked_inter_predictor(
const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
@@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor(
mi->sb_type, h, w, conv_params, xd);
}
-// TODO(sarahparker) av1_highbd_build_inter_predictor and
-// av1_build_inter_predictor should be combined with
-// av1_make_inter_predictor
-void av1_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
- InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- mv.col += SCALE_EXTRA_OFF;
- mv.row += SCALE_EXTRA_OFF;
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- mv.col & SCALE_SUBPEL_MASK,
- mv.row & SCALE_SUBPEL_MASK };
- ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-
- src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
- (mv.col >> SCALE_SUBPEL_BITS);
-
- av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
- w, h, &conv_params, interp_filters, warp_types,
- p_col, p_row, plane, ref, xd->mi[0], 0, xd,
- can_use_previous);
-}
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const MV *src_mv,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- mv.col += SCALE_EXTRA_OFF;
- mv.row += SCALE_EXTRA_OFF;
-
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- mv.col & SCALE_SUBPEL_MASK,
- mv.row & SCALE_SUBPEL_MASK };
- src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
- (mv.col >> SCALE_SUBPEL_BITS);
-
- av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
- w, h, conv_params, interp_filters, warp_types, p_col,
- p_row, plane, ref, xd->mi[0], 0, xd,
- can_use_previous);
-}
-
void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
int order_idx, int *fwd_offset, int *bck_offset,
int *use_jnt_comp_avg, int is_compound) {
@@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
*bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
}
-static INLINE void calc_subpel_params(
- MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
- int plane, const int pre_x, const int pre_y, int x, int y,
- struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
- int bw, int bh) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const int is_scaled = av1_is_scaled(sf);
- if (is_scaled) {
- int ssx = pd->subsampling_x;
- int ssy = pd->subsampling_y;
- int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
- orig_pos_y += mv.row * (1 << (1 - ssy));
- int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
- orig_pos_x += mv.col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
- subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
- subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
- subpel_params->xs = sf->x_step_q4;
- subpel_params->ys = sf->y_step_q4;
- } else {
- const MV mv_q4 = clamp_mv_to_umv_border_sb(
- xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
- subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
- subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
- (x + (mv_q4.col >> SUBPEL_BITS));
- }
-}
-
-static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int plane, const MB_MODE_INFO *mi,
- int build_for_obmc, int bw, int bh,
- int mi_x, int mi_y) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- int is_compound = has_second_ref(mi);
- int ref;
- const int is_intrabc = is_intrabc_block(mi);
- assert(IMPLIES(is_intrabc, !is_compound));
- int is_global[2] = { 0, 0 };
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
- is_global[ref] = is_global_mv_block(mi, wm->wmtype);
- }
-
- const BLOCK_SIZE bsize = mi->sb_type;
- const int ss_x = pd->subsampling_x;
- const int ss_y = pd->subsampling_y;
- int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
- (block_size_high[bsize] < 8 && ss_y);
-
- if (is_intrabc) sub8x8_inter = 0;
-
- // For sub8x8 chroma blocks, we may be covering more than one luma block's
- // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
- // the top-left corner of the prediction source - the correct top-left corner
- // is at (pre_x, pre_y).
- const int row_start =
- (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
- const int col_start =
- (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
- sub8x8_inter = sub8x8_inter && !build_for_obmc;
- if (sub8x8_inter) {
- for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
- for (int col = col_start; col <= 0; ++col) {
- const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
- if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
- }
- }
- }
-
- if (sub8x8_inter) {
- // sub-block (b4) and enclosing chroma block (b8) dimensions
- const int b4_w = block_size_wide[bsize] >> ss_x;
- const int b4_h = block_size_high[bsize] >> ss_y;
- const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
- const int b8_w = block_size_wide[plane_bsize] >> ss_x;
- const int b8_h = block_size_high[plane_bsize] >> ss_y;
- assert(!is_compound);
-
- const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
- int row = row_start;
- for (int y = 0; y < b8_h; y += b4_h) {
- int col = col_start;
- for (int x = 0; x < b8_w; x += b4_w) {
- MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- is_compound = has_second_ref(this_mbmi);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
- int tmp_dst_stride = 8;
- assert(bw < 8 || bh < 8);
- ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
- ref = 0;
- const RefBuffer *ref_buf =
- &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
-
- pd->pre[ref].buf0 =
- (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
- pd->pre[ref].buf =
- pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
- ref_buf->buf->uv_stride,
- &ref_buf->sf);
- pd->pre[ref].width = ref_buf->buf->uv_crop_width;
- pd->pre[ref].height = ref_buf->buf->uv_crop_height;
- pd->pre[ref].stride = ref_buf->buf->uv_stride;
-
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &ref_buf->sf;
- struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
- const MV mv = this_mbmi->mv[ref].as_mv;
-
- uint8_t *pre;
- SubpelParams subpel_params;
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[ref];
- warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- conv_params.ref = ref;
- conv_params.do_average = ref;
- if (is_masked_compound_type(mi->interinter_comp.type)) {
- // masked compound type has its own average mechanism
- conv_params.do_average = 0;
- }
-
- av1_make_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
- b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
- (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
- plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
- ++col;
- }
- ++row;
- }
-
- for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
- return;
- }
-
- {
- DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
- ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
- av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
- &conv_params.bck_offset,
- &conv_params.use_jnt_comp_avg, is_compound);
-
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
- const MV mv = mi->mv[ref].as_mv;
-
- uint8_t *pre;
- SubpelParams subpel_params;
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[ref];
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- conv_params.ref = ref;
-
- if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
- // masked compound type has its own average mechanism
- conv_params.do_average = 0;
- av1_make_masked_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
- bh, &conv_params, mi->interp_filters, plane, &warp_types,
- mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
- cm->allow_warped_motion);
- } else {
- conv_params.do_average = ref;
- av1_make_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
- bh, &conv_params, mi->interp_filters, &warp_types,
- mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
- mi, build_for_obmc, xd, cm->allow_warped_motion);
- }
- }
- }
-}
-
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
- MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int plane_from, int plane_to) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = pd->width;
- const int bh = pd->height;
-
- if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
- pd->subsampling_y))
- continue;
-
- build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
- }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-
- if (is_interintra_pred(xd->mi[0])) {
- BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
- { xd->plane[0].dst.stride, 0, 0 } };
- if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, ctx, 0, bsize);
- }
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
- MAX_MB_PLANE - 1);
-
- if (is_interintra_pred(xd->mi[0])) {
- BUFFER_SET default_ctx = {
- { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
- { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
- };
- if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sbuv(
- cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
- xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
- }
-}
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- const int num_planes = av1_num_planes(cm);
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- if (num_planes > 1)
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
-}
-
void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const int plane_start, const int plane_end) {
@@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred(
xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
xd->mb_to_right_edge = ctxt->mb_to_far_edge +
- (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_above_pred(
- MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
- MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
- struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
- const int above_mi_col = ctxt->mi_col + rel_mi_col;
- int mi_x, mi_y;
- MB_MODE_INFO backup_mbmi = *above_mbmi;
-
- av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
- above_mbmi, ctxt, num_planes);
- mi_x = above_mi_col << MI_SIZE_LOG2;
- mi_y = ctxt->mi_row << MI_SIZE_LOG2;
-
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
- for (int j = 0; j < num_planes; ++j) {
- const struct macroblockd_plane *pd = &xd->plane[j];
- int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
- int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
- block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
-
- if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
- build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
- }
- *above_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]) {
- if (!xd->up_available) return;
-
- // Adjust mb_to_bottom_edge to have the correct value for the OBMC
- // prediction block. This is half the height of the original block,
- // except for 128-wide blocks, where we only use a height of 32.
- int this_height = xd->n8_h * MI_SIZE;
- int pred_height = AOMMIN(this_height / 2, 32);
- xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
- struct build_prediction_ctxt ctxt = { cm, mi_row,
- mi_col, tmp_buf,
- tmp_width, tmp_height,
- tmp_stride, xd->mb_to_right_edge };
- BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- foreach_overlappable_nb_above(cm, xd, mi_col,
- max_neighbor_obmc[mi_size_wide_log2[bsize]],
- build_prediction_by_above_pred, &ctxt);
-
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ctxt.mb_to_far_edge;
- xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+ (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
}
void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
xd->mb_to_bottom_edge =
ctxt->mb_to_far_edge +
- (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_left_pred(
- MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
- MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
- struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
- const int left_mi_row = ctxt->mi_row + rel_mi_row;
- int mi_x, mi_y;
- MB_MODE_INFO backup_mbmi = *left_mbmi;
-
- av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
- left_mbmi, ctxt, num_planes);
- mi_x = ctxt->mi_col << MI_SIZE_LOG2;
- mi_y = left_mi_row << MI_SIZE_LOG2;
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
- for (int j = 0; j < num_planes; ++j) {
- const struct macroblockd_plane *pd = &xd->plane[j];
- int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
- block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
- int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
-
- if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
- build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
- }
- *left_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]) {
- if (!xd->left_available) return;
-
- // Adjust mb_to_right_edge to have the correct value for the OBMC
- // prediction block. This is half the width of the original block,
- // except for 128-wide blocks, where we only use a width of 32.
- int this_width = xd->n8_w * MI_SIZE;
- int pred_width = AOMMIN(this_width / 2, 32);
- xd->mb_to_right_edge += (this_width - pred_width) * 8;
-
- struct build_prediction_ctxt ctxt = { cm, mi_row,
- mi_col, tmp_buf,
- tmp_width, tmp_height,
- tmp_stride, xd->mb_to_bottom_edge };
- BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- foreach_overlappable_nb_left(cm, xd, mi_row,
- max_neighbor_obmc[mi_size_high_log2[bsize]],
- build_prediction_by_left_pred, &ctxt);
-
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_right_edge -= (this_width - pred_width) * 8;
- xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
-}
-
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
- const int num_planes = av1_num_planes(cm);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
- int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- int len = sizeof(uint16_t);
- dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
- dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
- dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
- dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
- dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
- dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
- } else {
- dst_buf1[0] = tmp_buf1;
- dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
- dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
- dst_buf2[0] = tmp_buf2;
- dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
- dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
- }
- av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
- dst_width1, dst_height1, dst_stride1);
- av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
- dst_width2, dst_height2, dst_stride2);
- av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
- mi_row, mi_col, 0, num_planes);
- av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
- dst_buf2, dst_stride2);
+ (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
}
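
The edge adjustment in the removed OBMC builders above follows one rule: the overlapped prediction covers half the block's extent, capped at 32 pixels. A standalone restatement of that rule:

static int obmc_pred_extent(int block_extent_pels) {
  const int half = block_extent_pels / 2;
  return half < 32 ? half : 32; /* AOMMIN(extent / 2, 32) */
}
/* e.g. a 32-pel-tall block is overlapped over 16 rows from above,
 * while a 128-pel-tall block is capped at 32 rows. */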
/* clang-format off */
@@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
}
-
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, uint8_t *upred,
- uint8_t *vpred, int ystride, int ustride,
- int vstride, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
- av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
- ctx, bsize);
-}
-
-// Builds the inter-predictor for the single ref case
-// for use in the encoder to search the wedges efficiently.
-static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
- int bw, int bh, int x, int y,
- int w, int h, int mi_x, int mi_y,
- int ref, uint8_t *const ext_dst,
- int ext_dst_stride,
- int can_use_previous) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO *mi = xd->mi[0];
-
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
- const MV mv = mi->mv[ref].as_mv;
-
- ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
- WarpTypesAllowed warp_types;
- const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
- warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- const int pre_x = (mi_x) >> pd->subsampling_x;
- const int pre_y = (mi_y) >> pd->subsampling_y;
- uint8_t *pre;
- SubpelParams subpel_params;
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
- &subpel_params, sf, w, h, &conv_params,
- mi->interp_filters, &warp_types, pre_x + x,
- pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
-}
-
-void av1_build_inter_predictors_for_planes_single_buf(
- MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
- int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
- int can_use_previous) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(
- bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
- mi_y, ref, ext_dst[plane],
- ext_dst_stride[plane], can_use_previous);
- }
-}
-
-static void build_wedge_inter_predictor_from_buf(
- MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
- int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int is_compound = has_second_ref(mbmi);
- MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- mbmi->interinter_comp.seg_mask = xd->seg_mask;
- const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
-
- if (is_compound && is_masked_compound_type(comp_data->type)) {
- if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- av1_build_compound_diffwtd_mask_highbd(
- comp_data->seg_mask, comp_data->mask_type,
- CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
- else
- av1_build_compound_diffwtd_mask(
- comp_data->seg_mask, comp_data->mask_type, ext_dst0,
- ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
- }
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- build_masked_compound_highbd(
- dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
- mbmi->sb_type, h, w, xd->bd);
- else
- build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
- ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
- h, w);
- } else {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
- xd->bd);
- else
- aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
- 0, NULL, 0, w, h);
- }
-}
-
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane_from, int plane_to,
- uint8_t *ext_dst0[3],
- int ext_dst_stride0[3],
- uint8_t *ext_dst1[3],
- int ext_dst_stride1[3]) {
- int plane;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(
- bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- build_wedge_inter_predictor_from_buf(
- xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
- ext_dst1[plane], ext_dst_stride1[plane]);
- }
-}
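
For reference, the masked-compound branch of the removed wedge builder above ultimately blends the two single-ref predictions pixel by pixel with 6-bit mask weights (mask values run 0..AOM_BLEND_A64_MAX_ALPHA, i.e. 0..64). A minimal sketch of that per-pixel blend; the helper name is hypothetical:

#include <stdint.h>

static uint8_t blend_a64_pixel_sketch(uint8_t p0, uint8_t p1, int m) {
  /* m runs 0..64; 32 is the rounding offset for the 6-bit shift. */
  return (uint8_t)((m * p0 + (64 - m) * p1 + 32) >> 6);
}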
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index 6a3def270..db86c777e 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RECONINTER_H_
-#define AV1_COMMON_RECONINTER_H_
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
@@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
const SubpelParams *subpel_params,
const struct scale_factors *sf, int w, int h,
ConvolveParams *conv_params,
- InterpFilters interp_filters) {
+ InterpFilters interp_filters,
+ int is_intrabc) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
- if (has_scale(subpel_params->xs, subpel_params->ys)) {
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_params->subpel_x,
subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf);
+ subpel_params->ys, 1, conv_params, sf, is_intrabc);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
- sp.ys, 0, conv_params, sf);
+ sp.ys, 0, conv_params, sf, is_intrabc);
}
}
-static INLINE void highbd_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
- int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) {
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc, int bd) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
- if (has_scale(subpel_params->xs, subpel_params->ys)) {
- av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
- interp_filters, subpel_params->subpel_x,
- subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf, bd);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters,
+ subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
- av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
- interp_filters, sp.subpel_x, sp.xs,
- sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+ sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
}
}
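
Both predictors above now take an is_intrabc flag whose contract is exactly the asserted one: intra block copy always runs at identity scale, so the scaled convolve branch must be unreachable for it. A tiny standalone restatement of that invariant (IMPLIES is the macro from aom_dsp/aom_dsp_common.h):

#include <assert.h>

#define IMPLIES(a, b) (!(a) || (b)) /* stand-in for the aom macro */

static void check_intrabc_never_scaled(int is_intrabc, int is_scaled) {
  assert(IMPLIES(is_intrabc, !is_scaled));
}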
@@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
return clamped_mv;
}
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const MV *src_mv,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous);
-
-void av1_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
- InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous);
-
static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *sf) {
const int x =
@@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf, const int num_planes);
-// Detect if the block has sub-pixel level motion vectors
-// per component.
-#define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
- const MACROBLOCKD *const xd,
- int dir) {
-#if CHECK_SUBPEL
- const BLOCK_SIZE bsize = mbmi->sb_type;
- int plane;
- int ref = (dir >> 1);
-
- if (dir & 0x01) {
- if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
- } else {
- if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
- }
-
- return 0;
-#else
- (void)mbmi;
- (void)xd;
- (void)dir;
- return 1;
-#endif
-}
-
static INLINE void set_default_interp_filters(
MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
mbmi->interp_filters =
@@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
return 1;
}
-static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
- MB_MODE_INFO *const mi = xd->mi[0];
- const int is_compound = has_second_ref(mi);
- int ref;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- int row_col;
- for (row_col = 0; row_col < 2; ++row_col) {
- const int dir = (ref << 1) + row_col;
- if (has_subpel_mv_component(mi, xd, dir)) {
- return 1;
- }
- }
- }
- return 0;
-}
void av1_setup_build_prediction_by_above_pred(
MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
MB_MODE_INFO *left_mbmi,
struct build_prediction_ctxt *ctxt,
const int num_planes);
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]);
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *above[MAX_MB_PLANE],
@@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
const uint8_t *av1_get_obmc_mask(int length);
void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col);
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col);
#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
@@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
const uint8_t *av1_get_compound_type_mask(
const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, uint8_t *upred,
- uint8_t *vpred, int ystride, int ustride,
- int vstride, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
// build interintra_predictors for one plane
void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *pred, int stride,
@@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
-// Encoder only
-void av1_build_inter_predictors_for_planes_single_buf(
- MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
- int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
- int can_use_previous);
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane_from, int plane_to,
- uint8_t *ext_dst0[3],
- int ext_dst_stride0[3],
- uint8_t *ext_dst1[3],
- int ext_dst_stride1[3]);
-
void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
int order_idx, int *fwd_offset, int *bck_offset,
int *use_jnt_comp_avg, int is_compound);
@@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
} // extern "C"
#endif
-#endif // AV1_COMMON_RECONINTER_H_
+#endif // AOM_AV1_COMMON_RECONINTER_H_
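
The guard rename above is the pattern this commit applies to every header it touches: the guard now spells out the full include path with an AOM_ prefix, e.g. for av1/common/reconinter.h:

#ifndef AOM_AV1_COMMON_RECONINTER_H_
#define AOM_AV1_COMMON_RECONINTER_H_
/* ... declarations ... */
#endif  // AOM_AV1_COMMON_RECONINTER_H_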
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index 57638f24e..07853aba0 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RECONINTRA_H_
-#define AV1_COMMON_RECONINTRA_H_
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
#include <stdlib.h>
@@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_RECONINTRA_H_
+#endif // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 93d62292a..d61a20aa2 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
{ -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 },
};
-// Filters for interpolation (full-band) - no filtering for integer pixels
-static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
- { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
- { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
- { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
- { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
- { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
- { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
- { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
- { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
- { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
- { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
- { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
- { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
- { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
- { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
- { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
- { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
- { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
- { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
- { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
- { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
- { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
- { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
- { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
- { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
- { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
- { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
- { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
- { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
- { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
- { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
- { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
-};
-
const int16_t av1_resize_filter_normative[(
1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
#if UPSCALE_NORMATIVE_TAPS == 8
@@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[(
#endif // UPSCALE_NORMATIVE_TAPS == 8
};
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
// Filters for factor of 2 downsampling.
static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
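
The hunk above deletes the duplicated storage, not the data: with UPSCALE_NORMATIVE_TAPS == 8 the removed full-band table matches av1_resize_filter_normative row for row, so the old identifier survives as an alias. A hypothetical self-check of that assumption, valid inside resize.c once the #define is in place:

#include <assert.h>

static void check_fullband_alias(void) {
  /* With the #define above, both names denote the same table. */
  assert((const void *)filteredinterp_filters1000 ==
         (const void *)av1_resize_filter_normative);
}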
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index feec3a90e..9a59a8d63 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RESIZE_H_
-#define AV1_ENCODER_RESIZE_H_
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
#include <stdio.h>
#include "aom/aom_integer.h"
@@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
} // extern "C"
#endif
-#endif // AV1_ENCODER_RESIZE_H_
+#endif // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 632967957..d276a915b 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = {
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
};
-static void selfguided_restoration_fast_internal(
- int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
- int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+ int dgd_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx,
+ int pass, int32_t *A, int32_t *B) {
const sgr_params_type *const params = &sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
@@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal(
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
- int32_t A_[RESTORATION_PROC_UNIT_PELS];
- int32_t B_[RESTORATION_PROC_UNIT_PELS];
- int32_t *A = A_;
- int32_t *B = B_;
+ const int step = pass == 0 ? 1 : 2;
int i, j;
assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
@@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal(
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
// Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
// for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
- for (i = -1; i < height + 1; i += 2) {
+ for (i = -1; i < height + 1; i += step) {
for (j = -1; j < width + 1; ++j) {
const int k = i * buf_stride + j;
const int n = (2 * r + 1) * (2 * r + 1);
@@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal(
SGRPROJ_RECIP_BITS);
}
}
+}
+
+static void selfguided_restoration_fast_internal(
+ int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 1, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
// Use the A[] and B[] arrays to calculate the filtered image
+ (void)r;
assert(r == 2);
for (i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
@@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
int dst_stride, int bit_depth,
int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
- const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
- const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
@@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
int32_t *A = A_;
int32_t *B = B_;
int i, j;
-
- assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
- assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
- "Need SGRPROJ_BORDER_* >= r+1");
-
- boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
- width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
- boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
- width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 0, A, B);
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
- // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
- // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
- for (i = -1; i < height + 1; ++i) {
- for (j = -1; j < width + 1; ++j) {
- const int k = i * buf_stride + j;
- const int n = (2 * r + 1) * (2 * r + 1);
-
- // a < 2^16 * n < 2^22 regardless of bit depth
- uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
- // b < 2^8 * n < 2^14 regardless of bit depth
- uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
-
- // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
- // and p itself satisfies p < 2^14 * n^2 < 2^26.
- // This bound on p is due to:
- // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
- //
- // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
- // This is an artefact of rounding, and can only happen if all pixels
- // are (almost) identical, so in this case we saturate to p=0.
- uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
-
- const uint32_t s = params->s[radius_idx];
-
- // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
- // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
- // (this holds even after accounting for the rounding in s)
- const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
-
- // Note: We have to be quite careful about the value of A[k].
- // This is used as a blend factor between individual pixel values and the
- // local mean. So it logically has a range of [0, 256], including both
- // endpoints.
- //
- // This is a pain for hardware, as we'd like something which can be stored
- // in exactly 8 bits.
- // Further, in the calculation of B[k] below, if z == 0 and r == 2,
- // then A[k] "should be" 0. But then we can end up setting B[k] to a value
- // slightly above 2^(8 + bit depth), due to rounding in the value of
- // one_by_x[25-1].
- //
- // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
- // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
- // overflow), without significantly affecting the final result: z == 0
- // implies that the image is essentially "flat", so the local mean and
- // individual pixel values are very similar.
- //
- // Note that saturating on the other side, i.e. requiring A[k] <= 255,
- // would be a bad idea, as that corresponds to the case where the image
- // is very variable, when we want to preserve the local pixel value as
- // much as possible.
- A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
- // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
- // one_by_x[n - 1] = round(2^12 / n)
- // => the product here is < 2^(20 + bit_depth) <= 2^32,
- // and B[k] is set to a value < 2^(8 + bit depth)
- // This holds even with the rounding in one_by_x and in the overall
- // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
- B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
- (uint32_t)B[k] *
- (uint32_t)one_by_x[n - 1],
- SGRPROJ_RECIP_BITS);
- }
- }
// Use the A[] and B[] arrays to calculate the filtered image
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
}
}
-void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
@@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
if (params->r[1] > 0)
selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
flt_stride, bit_depth, sgr_params_idx, 1);
+ return 0;
}
void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
@@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
- eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_c(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
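
The refactor above leaves one shared routine filling the A[]/B[] intermediates; the new pass argument only picks the row stride. The fast path computes every other row and derives the even rows afterwards, which is the whole of the difference:

static int row_step_for_pass(int pass) {
  return (pass == 0) ? 1 : 2; /* 0: full filter, 1: fast filter */
}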
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index aec37d834..d834f9270 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RESTORATION_H_
-#define AV1_COMMON_RESTORATION_H_
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
#include "aom_ports/mem.h"
#include "config/aom_config.h"
@@ -120,6 +120,7 @@ extern "C" {
// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
#define WIENER_FILT_PREC_BITS 7
#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
@@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
} // extern "C"
#endif
-#endif // AV1_COMMON_RESTORATION_H_
+#endif // AOM_AV1_COMMON_RESTORATION_H_
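
A worked expansion of the new macro, assuming the default WIENER_WIN of 7 (the configuration the comment above describes):

#define WIENER_WIN_SKETCH 7
#define WIENER_WIN_CHROMA_SKETCH (WIENER_WIN_SKETCH - 2) /* 5 */
#define WIENER_WIN2_CHROMA_SKETCH \
  ((WIENER_WIN_CHROMA_SKETCH) * (WIENER_WIN_CHROMA_SKETCH)) /* 25 taps */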
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
index 5f02fdb81..748e958c3 100644
--- a/third_party/aom/av1/common/scale.h
+++ b/third_party/aom/av1/common/scale.h
@@ -9,12 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SCALE_H_
-#define AV1_COMMON_SCALE_H_
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
#include "av1/common/convolve.h"
#include "av1/common/mv.h"
-#include "aom_dsp/aom_convolve.h"
#ifdef __cplusplus
extern "C" {
@@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
} // extern "C"
#endif
-#endif // AV1_COMMON_SCALE_H_
+#endif // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index d206586b5..233dc0efa 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SCAN_H_
-#define AV1_COMMON_SCAN_H_
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
@@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
} // extern "C"
#endif
-#endif // AV1_COMMON_SCAN_H_
+#endif // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
index c851d65fd..8c35bba86 100644
--- a/third_party/aom/av1/common/seg_common.h
+++ b/third_party/aom/av1/common/seg_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SEG_COMMON_H_
-#define AV1_COMMON_SEG_COMMON_H_
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
#include "aom_dsp/prob.h"
@@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
} // extern "C"
#endif
-#endif // AV1_COMMON_SEG_COMMON_H_
+#endif // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index f9b734b8c..8df4c9a09 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows(
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->xd, lf_sync);
return 1;
@@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
AVxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
- worker->hook = (AVxWorkerHook)loop_filter_row_worker;
+ worker->hook = loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
@@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
}
// Implement row loop restoration for each thread.
-static int loop_restoration_row_worker(AV1LrSync *const lr_sync,
- LRWorkerData *lrworkerdata) {
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+ AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+ LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
int lr_unit_row;
@@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
int num_rows_lr = 0;
for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
const int max_tile_h = tile_rect.bottom - tile_rect.top;
- const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+ const int unit_size = cm->rst_info[plane].restoration_unit_size;
num_rows_lr =
AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
@@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
for (i = 0; i < num_workers; ++i) {
AVxWorker *const worker = &workers[i];
lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
- worker->hook = (AVxWorkerHook)loop_restoration_row_worker;
+ worker->hook = loop_restoration_row_worker;
worker->data1 = lr_sync;
worker->data2 = &lr_sync->lrworkerdata[i];
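
Both hooks above were rewritten to the generic AVxWorker signature, two untyped pointers cast back inside the worker, which is why the (AVxWorkerHook) casts at the assignment sites could be dropped. The shape, as a standalone sketch:

typedef int (*worker_hook_sketch)(void *arg1, void *arg2);

static int example_worker(void *arg1, void *arg2) {
  /* Cast arg1/arg2 back to the worker's own context types here. */
  (void)arg1;
  (void)arg2;
  return 1; /* nonzero signals success, as in the hooks above */
}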
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
index 4b0d5d2b8..23d61d72a 100644
--- a/third_party/aom/av1/common/thread_common.h
+++ b/third_party/aom/av1/common/thread_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
-#define AV1_COMMON_LOOPFILTER_THREAD_H_
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
#include "config/aom_config.h"
@@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
} // extern "C"
#endif
-#endif // AV1_COMMON_LOOPFILTER_THREAD_H_
+#endif // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 026c904b6..1b413487f 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
assert(tile->mi_col_end > tile->mi_col_start);
}
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_rows;
+}
+
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_cols;
+}
+
int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
// Round the frame up to a whole number of max superblocks
mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
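
A worked example of the two helpers added above. ALIGN_POWER_OF_TWO(v, n) (aom_ports/mem.h) rounds v up to a multiple of 1 << n; mib_size_log2 is 5 for 128x128 superblocks (32 4x4 MI units) and 4 for 64x64:

static int sb_units_sketch(int mi_units, int mib_size_log2) {
  const int aligned =
      (mi_units + (1 << mib_size_log2) - 1) & ~((1 << mib_size_log2) - 1);
  return aligned >> mib_size_log2;
}
/* e.g. a tile spanning 35 MI rows with 128x128 superblocks:
 * align(35, 5) == 64, and 64 >> 5 == 2 superblock rows. */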
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
index be037fb17..c03553dc6 100644
--- a/third_party/aom/av1/common/tile_common.h
+++ b/third_party/aom/av1/common/tile_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_TILE_COMMON_H_
-#define AV1_COMMON_TILE_COMMON_H_
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
#ifdef __cplusplus
extern "C" {
@@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
// tiles horizontally or vertically in the frame.
int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
typedef struct {
int left, top, right, bottom;
} AV1PixelRect;
@@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm);
} // extern "C"
#endif
-#endif // AV1_COMMON_TILE_COMMON_H_
+#endif // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index 1749baa57..06939ae43 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_TIMING_H_
-#define AOM_TIMING_H_
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
#include "aom/aom_integer.h"
#include "av1/common/enums.h"
@@ -56,4 +56,4 @@ void set_resource_availability_parameters(
int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
int seq_tier);
-#endif // AOM_TIMING_H_
+#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
index 9a6b454ac..53e956450 100644
--- a/third_party/aom/av1/common/token_cdfs.h
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
#include "config/aom_config.h"
#include "av1/common/entropy.h"
@@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
{ AOM_CDF3(10923, 21845) },
{ AOM_CDF3(10923, 21845) },
{ AOM_CDF3(10923, 21845) } } } } };
+
+#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index f0ab79d0f..1dda51f8b 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_TXB_COMMON_H_
-#define AV1_COMMON_TXB_COMMON_H_
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
extern const int16_t k_eob_group_start[12];
extern const int16_t k_eob_offset_bits[12];
@@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = {
0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
};
-// Note: TX_PAD_2D is dependent on this offset table.
-static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
- /* clang-format off*/
- { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
- { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 }
- /* clang-format on*/
-};
-
-#define CONTEXT_MAG_POSITION_NUM 3
-static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = {
- { { 0, 1 }, { 1, 0 }, { 1, 1 } },
- { { 0, 1 }, { 1, 0 }, { 0, 2 } },
- { { 0, 1 }, { 1, 0 }, { 2, 0 } }
-};
-static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = {
- { 0, 1 }, { 1, 0 }, { 1, 1 }
-};
-
static const TX_CLASS tx_type_to_class[TX_TYPES] = {
TX_CLASS_2D, // DCT_DCT
TX_CLASS_2D, // ADST_DCT
@@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = {
TX_CLASS_HORIZ, // H_FLIPADST
};
-static const int8_t eob_to_pos_small[33] = {
- 0, 1, 2, // 0-2
- 3, 3, // 3-4
- 4, 4, 4, 4, // 5-8
- 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
-};
-
-static const int8_t eob_to_pos_large[17] = {
- 6, // placeholder
- 7, // 33-64
- 8, 8, // 65-128
- 9, 9, 9, 9, // 129-256
- 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
- 11 // 513-
-};
-
-static INLINE int get_eob_pos_token(const int eob, int *const extra) {
- int t;
-
- if (eob < 33) {
- t = eob_to_pos_small[eob];
- } else {
- const int e = AOMMIN((eob - 1) >> 5, 16);
- t = eob_to_pos_large[e];
- }
-
- *extra = eob - k_eob_group_start[t];
-
- return t;
-}
-
-static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type,
- const int eob_token) {
- static const int8_t tx_type_to_offset[TX_TYPES] = {
- -1, // DCT_DCT
- -1, // ADST_DCT
- -1, // DCT_ADST
- -1, // ADST_ADST
- -1, // FLIPADST_DCT
- -1, // DCT_FLIPADST
- -1, // FLIPADST_FLIPADST
- -1, // ADST_FLIPADST
- -1, // FLIPADST_ADST
- -1, // IDTX
- 10, // V_DCT
- 10, // H_DCT
- 10, // V_ADST
- 10, // H_ADST
- 10, // V_FLIPADST
- 10, // H_FLIPADST
- };
- return eob_token + tx_type_to_offset[tx_type];
-}
-
static INLINE int get_txb_bwl(TX_SIZE tx_size) {
tx_size = av1_get_adjusted_tx_size(tx_size);
return tx_size_wide_log2[tx_size];
@@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) {
return tx_size_high[tx_size];
}
-static INLINE void get_base_count_mag(int *mag, int *count,
- const tran_low_t *tcoeffs, int bwl,
- int height, int row, int col) {
- mag[0] = 0;
- mag[1] = 0;
- for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0;
- for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
- const int ref_row = row + base_ref_offset[idx][0];
- const int ref_col = col + base_ref_offset[idx][1];
- if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
- ref_col >= (1 << bwl))
- continue;
- const int pos = (ref_row << bwl) + ref_col;
- tran_low_t abs_coeff = abs(tcoeffs[pos]);
- // count
- for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
- count[i] += abs_coeff > i;
- }
- // mag
- if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) {
- if (abs_coeff > mag[0]) {
- mag[0] = abs_coeff;
- mag[1] = 1;
- } else if (abs_coeff == mag[0]) {
- ++mag[1];
- }
- }
- }
-}
-
static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
}
@@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) {
return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
}
-static INLINE int get_level_count(const uint8_t *const levels, const int stride,
- const int row, const int col, const int level,
- const int (*nb_offset)[2], const int nb_num) {
- int count = 0;
-
- for (int idx = 0; idx < nb_num; ++idx) {
- const int ref_row = row + nb_offset[idx][0];
- const int ref_col = col + nb_offset[idx][1];
- const int pos = ref_row * stride + ref_col;
- count += levels[pos] > level;
- }
- return count;
-}
-
-static INLINE void get_level_mag(const uint8_t *const levels, const int stride,
- const int row, const int col, int *const mag) {
- for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) {
- const int ref_row = row + mag_ref_offset[idx][0];
- const int ref_col = col + mag_ref_offset[idx][1];
- const int pos = ref_row * stride + ref_col;
- mag[idx] = levels[pos];
- }
-}
-
static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
int sig_mag) {
const int ctx = base_level_count_to_index[count];
@@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
return ctx_idx;
}
-static INLINE int get_base_ctx(const uint8_t *const levels,
- const int c, // raster order
- const int bwl, const int level_minus_1,
- const int count) {
- const int row = c >> bwl;
- const int col = c - (row << bwl);
- const int stride = (1 << bwl) + TX_PAD_HOR;
- int mag_count = 0;
- int nb_mag[3] = { 0 };
-
- get_level_mag(levels, stride, row, col, nb_mag);
-
- for (int idx = 0; idx < 3; ++idx)
- mag_count += nb_mag[idx] > (level_minus_1 + 1);
- const int ctx_idx =
- get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count));
- return ctx_idx;
-}
-
-#define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context
-// Note: TX_PAD_2D is dependent to this offset table.
-static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
- /* clang-format off*/
- { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
- { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 },
- /* clang-format on*/
-};
-
-static const int br_level_map[9] = {
- 0, 0, 1, 1, 2, 2, 3, 3, 3,
-};
-
-// Note: If BR_MAG_OFFSET changes, the calculation of offset in
-// get_br_ctx_from_count_mag() must be updated.
-#define BR_MAG_OFFSET 1
-// TODO(angiebird): optimize this function by using a table to map from
-// count/mag to ctx
-
-static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
- int height, int row, int col, int level) {
- mag[0] = 0;
- mag[1] = 0;
- int count = 0;
- for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) {
- const int ref_row = row + br_ref_offset[idx][0];
- const int ref_col = col + br_ref_offset[idx][1];
- if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
- ref_col >= (1 << bwl))
- continue;
- const int pos = (ref_row << bwl) + ref_col;
- tran_low_t abs_coeff = abs(tcoeffs[pos]);
- count += abs_coeff > level;
- if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) {
- if (abs_coeff > mag[0]) {
- mag[0] = abs_coeff;
- mag[1] = 1;
- } else if (abs_coeff == mag[0]) {
- ++mag[1];
- }
- }
- }
- return count;
-}
-
-static INLINE int get_br_ctx_from_count_mag(const int row, const int col,
- const int count, const int mag) {
- // DC: 0 - 1
- // Top row: 2 - 4
- // Left column: 5 - 7
- // others: 8 - 11
- static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } };
- const int mag_clamp = AOMMIN(mag, 6);
- const int offset = mag_clamp >> 1;
- const int ctx =
- br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col];
- return ctx;
-}
-
static INLINE int get_br_ctx_2d(const uint8_t *const levels,
const int c, // raster order
const int bwl) {
@@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
return mag + 14;
}
-#define SIG_REF_OFFSET_NUM 5
-
-// Note: TX_PAD_2D is dependent to these offset tables.
-static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
- { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 }
- // , { 1, 2 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = {
- { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 }
- // , { 1, 1 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = {
- { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 }
- // , { 1, 1 }, { 1, 2 },
-};
-
-#define SIG_REF_DIFF_OFFSET_NUM 3
-
-static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 1, 1 }, { 0, 2 }, { 2, 0 }
-};
-
-static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 2, 0 }, { 3, 0 }, { 4, 0 }
-};
-
-static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 0, 2 }, { 0, 3 }, { 0, 4 }
-};
-
static const uint8_t clip_max3[256] = {
0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
void av1_init_lv_map(AV1_COMMON *cm);
-#endif // AV1_COMMON_TXB_COMMON_H_
+#endif // AOM_AV1_COMMON_TXB_COMMON_H_
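
Note: among the helpers that survive in txb_common.h, get_padded_idx() converts a raster index inside a width-(1 << bwl) block into its index in the right-padded level buffer. A worked example, assuming TX_PAD_HOR_LOG2 == 2 (each row padded by 4 entries) as defined in this header:

/* 16-wide block: bwl == 4, raster index 35 is (row 2, col 3).
 * The padded stride is 16 + 4 == 20, so the padded index should be
 * 2 * 20 + 3 == 43, and indeed:
 *   35 + ((35 >> 4) << 2) == 35 + 8 == 43
 */
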
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 412d83ed8..4144c4389 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -562,7 +562,7 @@ static int64_t highbd_warp_error(
const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
conv_params.use_jnt_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
@@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
- ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
conv_params.use_jnt_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
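
Note: both warp-error helpers drop the leading argument of get_conv_params() to match its updated prototype; only the do-average flag, plane index, and bit depth remain. A usage sketch, assuming that (do_average, plane, bd) parameter order:

ConvolveParams conv_params = get_conv_params(/*do_average=*/0, /*plane=*/0, /*bd=*/8);
conv_params.use_jnt_comp_avg = 0;  // single-reference path, no joint averaging
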
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index ce4032ee5..a1a4f067d 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_WARPED_MOTION_H_
-#define AV1_COMMON_WARPED_MOTION_H_
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
#include <stdio.h>
#include <stdlib.h>
@@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
int mi_col);
int get_shear_params(WarpedMotionParams *wm);
-#endif // AV1_COMMON_WARPED_MOTION_H_
+#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 0c5286f9d..d9fb53785 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index ae331b40d..5db2ccf6c 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -18,6 +18,12 @@
#include "av1/common/x86/av1_inv_txfm_avx2.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
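
Note: the NewSqrt2list table duplicated into this file (and into the SSSE3 file below) holds successive powers of sqrt(2) in Q12 fixed point; the even powers are exact. A quick check of the entries:

/* Q12 scale: 1.0 == 4096
 *   Sqrt2^1: round(1.41421356 * 4096) ==  5793
 *   Sqrt2^2: 2 * 4096 ==  8192  (exactly 2)
 *   Sqrt2^3: 2 * 5793 == 11586  (2 * sqrt(2))
 *   Sqrt2^4: 4 * 4096 == 16384  (exactly 4)
 *   Sqrt2^5: 4 * 5793 == 23172  (4 * sqrt(2))
 */
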
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index 7b5b29cf8..f74cbaeaa 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
#include <immintrin.h>
@@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
}
#endif
-#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
index dd7cee24c..995bc3da4 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -16,6 +16,12 @@
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
static void idct4_new_sse2(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
index dc9be25d2..66bd339d1 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3
@@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
};
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
- 4 * 5793 };
-
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
};
@@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
} // extern "C"
#endif
-#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
index 721cfe059..77aeb6eb1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -314,4 +314,4 @@ typedef struct {
#ifdef __cplusplus
}
#endif // __cplusplus
-#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index 367e02096..6cad821b1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXFM_SSE4_H_
-#define AV1_TXFM_SSE4_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
#include <smmintrin.h>
@@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
__m128i *output,
const int size,
- const int bit) {
- const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+ const int bit,
+ const int val) {
+ const __m128i sqrt2 = _mm_set1_epi32(val);
if (bit > 0) {
int i;
for (i = 0; i < size; i++) {
@@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
}
#endif
-#endif // AV1_TXFM_SSE4_H_
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
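
Note: av1_round_shift_rect_array_32_sse4_1() now takes the rectangular scale factor as a parameter instead of hard-coding NewSqrt2, so callers can pass a different Q12 multiplier per transform size. A scalar sketch of the per-element operation, assuming the Q12 convention (NewSqrt2Bits == 12) used by libaom's NewSqrt2 constants:

#include <stdint.h>

/* Round-shift x by `bit` (or scale up when bit <= 0), then apply the Q12
 * factor `val` (e.g. 5793 for sqrt(2)) with a final rounded shift by 12. */
static inline int32_t round_shift_rect_scalar(int32_t x, int bit, int32_t val) {
  const int32_t r =
      (bit > 0) ? ((x + (1 << (bit - 1))) >> bit) : x * (1 << -bit);
  return (int32_t)(((int64_t)r * val + (1 << 11)) >> 12);
}
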
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
index 7479ac3e1..3b342cd4e 100644
--- a/third_party/aom/av1/common/x86/cfl_simd.h
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
#include "av1/common/blockd.h"
// SSSE3 versions are optimal for width == 4; we reuse them in AVX2
@@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3, int bd);
void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3, int bd);
+
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index 1099144fe..0acafd044 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -11,10 +11,8 @@
#include <immintrin.h>
-#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index 637f83cf7..b1a62a4f6 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,9 +11,8 @@
#include <emmintrin.h>
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index f66dee37d..5016642de 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -11,9 +11,8 @@
#include <emmintrin.h>
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
@@ -76,8 +75,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
return convolve(ss, coeffs);
}
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
@@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
}
}
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
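
Note the const fix folded into these prototype changes: av1_convolve_x_sr_sse2() and av1_convolve_y_sr_sse2() write the filtered result into dst, so the destination pointer was never legitimately const uint8_t *; the new signatures bring the definitions in line with the generated RTCD declarations.
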
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index 8444ffa93..ae68f0bbb 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index eb340523a..3f8dafb4b 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -15,7 +15,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 33183fdee..1d029db39 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index debb05a6d..ade2af03e 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -15,6 +15,9 @@
#include "config/av1_rtcd.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
// Note:
// Total 32x4 registers to represent 32x32 block coefficients.
@@ -27,131 +30,125 @@
// ... ...
// v124, v125, v126, v127
-static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(u, max);
+ clamped = _mm256_andnot_si256(mask, u);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+ __m256i res0, __m256i res1,
+ const int bd) {
+ __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+ __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+
+ x0 = _mm256_add_epi32(res0, x0);
+ x1 = _mm256_add_epi32(res1, x1);
+ x0 = _mm256_packus_epi32(x0, x1);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+ __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+
+ _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+ }
+}
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
__m256i u0, u1, u2, u3, u4, u5, u6, u7;
__m256i x0, x1;
- u0 = _mm256_unpacklo_epi32(in[0], in[4]);
- u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+ u0 = _mm256_unpacklo_epi32(in[0], in[1]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[1]);
- u2 = _mm256_unpacklo_epi32(in[8], in[12]);
- u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+ u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+ u3 = _mm256_unpackhi_epi32(in[2], in[3]);
- u4 = _mm256_unpacklo_epi32(in[16], in[20]);
- u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+ u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+ u5 = _mm256_unpackhi_epi32(in[4], in[5]);
- u6 = _mm256_unpacklo_epi32(in[24], in[28]);
- u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+ u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+ u7 = _mm256_unpackhi_epi32(in[6], in[7]);
x0 = _mm256_unpacklo_epi64(u0, u2);
x1 = _mm256_unpacklo_epi64(u4, u6);
out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpackhi_epi64(u0, u2);
x1 = _mm256_unpackhi_epi64(u4, u6);
- out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpacklo_epi64(u1, u3);
x1 = _mm256_unpacklo_epi64(u5, u7);
- out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpackhi_epi64(u1, u3);
x1 = _mm256_unpackhi_epi64(u5, u7);
- out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
-}
-
-static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
- transpose_32x32_8x8(&in[0], &out[0]);
- transpose_32x32_8x8(&in[1], &out[32]);
- transpose_32x32_8x8(&in[32], &out[1]);
- transpose_32x32_8x8(&in[33], &out[33]);
-}
-
-static void transpose_32x32(const __m256i *in, __m256i *out) {
- transpose_32x32_16x16(&in[0], &out[0]);
- transpose_32x32_16x16(&in[2], &out[64]);
- transpose_32x32_16x16(&in[64], &out[2]);
- transpose_32x32_16x16(&in[66], &out[66]);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
-static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
+ int input_stride, int size) {
int i;
- for (i = 0; i < 128; ++i) {
- in[i] = _mm256_loadu_si256((const __m256i *)coeff);
- coeff += 8;
+ for (i = 0; i < size; ++i) {
+ in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
}
}
-static __m256i highbd_clamp_epi32(__m256i x, int bd) {
- const __m256i zero = _mm256_setzero_si256();
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
- __m256i clamped, mask;
-
- mask = _mm256_cmpgt_epi16(x, max);
- clamped = _mm256_andnot_si256(mask, x);
- mask = _mm256_and_si256(mask, max);
- clamped = _mm256_or_si256(mask, clamped);
- mask = _mm256_cmpgt_epi16(clamped, zero);
- clamped = _mm256_and_si256(clamped, mask);
-
- return clamped;
-}
-
-static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
- const __m256i zero = _mm256_setzero_si256();
- int i = 0;
- (void)fliplr;
- (void)flipud;
-
- __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
-
- while (i < 128) {
- u0 = _mm256_loadu_si256((const __m256i *)output);
- u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
-
- x0 = _mm256_unpacklo_epi16(u0, zero);
- x1 = _mm256_unpackhi_epi16(u0, zero);
- x2 = _mm256_unpacklo_epi16(u1, zero);
- x3 = _mm256_unpackhi_epi16(u1, zero);
-
- v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
- v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
- v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
- v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
-
- v0 = _mm256_add_epi32(v0, round);
- v1 = _mm256_add_epi32(v1, round);
- v2 = _mm256_add_epi32(v2, round);
- v3 = _mm256_add_epi32(v3, round);
-
- v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
- v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
- v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
- v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
-
- v0 = _mm256_add_epi32(v0, x0);
- v1 = _mm256_add_epi32(v1, x1);
- v2 = _mm256_add_epi32(v2, x2);
- v3 = _mm256_add_epi32(v3, x3);
-
- v0 = _mm256_packus_epi32(v0, v1);
- v2 = _mm256_packus_epi32(v2, v3);
-
- v0 = highbd_clamp_epi32(v0, bd);
- v2 = highbd_clamp_epi32(v2, bd);
-
- _mm256_storeu_si256((__m256i *)output, v0);
- _mm256_storeu_si256((__m256i *)(output + 16), v2);
- output += stride;
- i += 4;
- }
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *rounding, int bit) {
+ __m256i x;
+ x = _mm256_mullo_epi32(*w0, *n0);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
}
static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
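
Note: the helpers added above are 8-lane AVX2 forms of two scalar primitives — a round-to-nearest right shift and a clamp of reconstructed samples to the valid bd-bit range. Scalar sketches of both, under hypothetical names mirroring av1_round_shift_32_avx2() and highbd_clamp_epi16_avx2():

#include <stdint.h>

/* (x + 2^(bit-1)) >> bit: what av1_round_shift_32_avx2() does per lane. */
static inline int32_t round_shift_scalar(int32_t x, int bit) {
  return (x + (1 << (bit - 1))) >> bit;
}

/* Pin a sample to [0, 2^bd - 1]; the vector version relies on 16-bit
 * compares, so bd must stay below 15 for the max to fit an epi16 lane. */
static inline uint16_t clamp_highbd_scalar(int32_t v, int bd) {
  const int32_t hi = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > hi ? hi : v));
}
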
@@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
__m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
__m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm256_max_epi32(a0, *clamp_lo);
a0 = _mm256_min_epi32(a0, *clamp_hi);
a1 = _mm256_max_epi32(a1, *clamp_lo);
a1 = _mm256_min_epi32(a1, *clamp_hi);
- a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
+static INLINE void idct32_stage4_avx2(
+ __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+ const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+ const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+ __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+ const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+ __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ x = _mm256_add_epi32(offset, x);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm256_max_epi32(x, clamp_lo_out);
+ x = _mm256_min_epi32(x, clamp_hi_out);
+ }
+
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+}
+
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
int out_shift) {
const int32_t *cospi = cospi_arr(bit);
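
Note: idct32_low1_avx2() above is the eob == 1 shortcut — with only the DC coefficient nonzero, stages 1 through 8 collapse and all 32 outputs share a single value scaled by cospi[32] (cos(pi/4) in Q(bit)). A scalar sketch of that path, omitting the clamping and out_shift branch shown above:

#include <stdint.h>

static inline void idct32_dc_only_sketch(int32_t in0, int32_t cospi32, int bit,
                                         int32_t *out /* 32 entries */) {
  const int64_t x = (int64_t)in0 * cospi32 + ((int64_t)1 << (bit - 1));
  const int32_t dc = (int32_t)(x >> bit);  // rounded Q(bit) product
  for (int i = 0; i < 32; ++i) out[i] = dc;
}
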
@@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
__m256i bf1[32], bf0[32];
- int col;
- for (col = 0; col < 4; ++col) {
+ {
// stage 0
// stage 1
- bf1[0] = in[0 * 4 + col];
- bf1[1] = in[16 * 4 + col];
- bf1[2] = in[8 * 4 + col];
- bf1[3] = in[24 * 4 + col];
- bf1[4] = in[4 * 4 + col];
- bf1[5] = in[20 * 4 + col];
- bf1[6] = in[12 * 4 + col];
- bf1[7] = in[28 * 4 + col];
- bf1[8] = in[2 * 4 + col];
- bf1[9] = in[18 * 4 + col];
- bf1[10] = in[10 * 4 + col];
- bf1[11] = in[26 * 4 + col];
- bf1[12] = in[6 * 4 + col];
- bf1[13] = in[22 * 4 + col];
- bf1[14] = in[14 * 4 + col];
- bf1[15] = in[30 * 4 + col];
- bf1[16] = in[1 * 4 + col];
- bf1[17] = in[17 * 4 + col];
- bf1[18] = in[9 * 4 + col];
- bf1[19] = in[25 * 4 + col];
- bf1[20] = in[5 * 4 + col];
- bf1[21] = in[21 * 4 + col];
- bf1[22] = in[13 * 4 + col];
- bf1[23] = in[29 * 4 + col];
- bf1[24] = in[3 * 4 + col];
- bf1[25] = in[19 * 4 + col];
- bf1[26] = in[11 * 4 + col];
- bf1[27] = in[27 * 4 + col];
- bf1[28] = in[7 * 4 + col];
- bf1[29] = in[23 * 4 + col];
- bf1[30] = in[15 * 4 + col];
- bf1[31] = in[31 * 4 + col];
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
// stage 2
bf0[0] = bf1[0];
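
Note: the index rewrite in this hunk tracks a layout change — idct32_avx2() no longer iterates over four interleaved column groups (in[k * 4 + col]) but is invoked once per group of 8 columns, so row k of the transform reads in[k] directly. The caller is now expected to advance the in/out pointers between groups, which is also why the 1-D kernels are registered in the ..._w8_arr dispatch table further below.
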
@@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
// stage 9
if (do_cols) {
- addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
- out + 31 * 4 + col);
- addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
- out + 30 * 4 + col);
- addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
- out + 29 * 4 + col);
- addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
- out + 28 * 4 + col);
- addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
- out + 27 * 4 + col);
- addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
- out + 26 * 4 + col);
- addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
- out + 25 * 4 + col);
- addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
- out + 24 * 4 + col);
- addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
- out + 23 * 4 + col);
- addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
- out + 22 * 4 + col);
- addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
- out + 21 * 4 + col);
- addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
- out + 20 * 4 + col);
- addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
- out + 19 * 4 + col);
- addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
- out + 18 * 4 + col);
- addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
- out + 17 * 4 + col);
- addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
- out + 16 * 4 + col);
+ addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
} else {
- addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
- out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
- out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
- out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
- out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
- out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
- out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m256i in[128], out[128];
- const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
- const int txw_idx = get_txw_idx(TX_32X32);
- const int txh_idx = get_txh_idx(TX_32X32);
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
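+ // Indexed as [tx size][1D transform type][eob bucket]. Only the 32-point
+ // IDCT variants (low1/low8/low16/full) have AVX2 implementations here; all
+ // other entries are NULL.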
+static const transform_1d_avx2
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 2];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
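+ // Map the last nonzero column/row to one of four workload buckets so a
+ // reduced (low1/low8/low16) 1D transform can be picked when most
+ // coefficients are zero.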
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m256i *buf0_cur = buf0 + j * 8;
+ load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
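+ // Transpose each 8x8 tile into the layout expected by the 1D row transform.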
+ transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+ }
+
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m256i *_buf1 = buf1 + i * 8;
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
switch (tx_type) {
case DCT_DCT:
- load_buffer_32x32(coeff, in);
- transpose_32x32(in, out);
- idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
- transpose_32x32(in, out);
- idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
+ highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ const int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // The assembly version doesn't support IDTX, so use the C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+
default: assert(0);
}
}
+
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
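+ // Only 32x32 has a dedicated AVX2 path so far; the other sizes reuse the
+ // SSE4.1 and C helpers.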
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 801a4133b..e29e0baf5 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -15,8 +15,60 @@
#include "config/av1_rtcd.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
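+ // Widen the 16-bit prediction to 32 bits, add the residuals, then pack and
+ // clamp back to pixel range.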
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+ __m128i res0, __m128i res1,
+ const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+ __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+
+ x0 = _mm_add_epi32(res0, x0);
+ x1 = _mm_add_epi32(res1, x1);
+ x0 = _mm_packus_epi32(x0, x1);
+ x0 = highbd_clamp_epi16(x0, bd);
+ return x0;
+}
+
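+ // Add the residual rows to the prediction, walking bottom-up when flipud is
+ // set.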
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
+
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
+ }
+}
+
static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
@@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i a0 = _mm_add_epi32(in0_w_offset, in1);
__m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
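+ // Shift before clamping so the clamp bounds apply to the final output range.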
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm_max_epi32(a0, *clamp_lo);
a0 = _mm_min_epi32(a0, *clamp_hi);
a1 = _mm_max_epi32(a1, *clamp_lo);
a1 = _mm_min_epi32(a1, *clamp_hi);
- a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
+static INLINE void idct32_stage4_sse4_1(
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 =
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] =
+ half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i *out0, __m128i *out1,
const __m128i *clamp_lo, const __m128i *clamp_hi,
@@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i a0 = _mm_add_epi32(offset, in0);
__m128i a1 = _mm_sub_epi32(offset, in1);
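+ // As in addsub_shift_sse4_1, the shift now precedes the clamp.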
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm_max_epi32(a0, *clamp_lo);
a0 = _mm_min_epi32(a0, *clamp_hi);
a1 = _mm_max_epi32(a1, *clamp_lo);
a1 = _mm_min_epi32(a1, *clamp_hi);
- a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
@@ -96,9 +361,6 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3, x, y;
@@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
v3 = _mm_add_epi32(v3, rnding);
v3 = _mm_srai_epi32(v3, bit);
- addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
- addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
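+ // The column pass leaves the butterfly outputs unclamped; the row pass
+ // clamps them to the max(16, bd + 6)-bit intermediate range.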
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+ addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+ } else {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+ }
}
-static void iadst4x4_sse4_1(__m128i *in, int bit) {
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
const int32_t *sinpi = sinpi_arr(bit);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
@@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) {
u3 = _mm_add_epi32(u3, rnding);
u3 = _mm_srai_epi32(u3, bit);
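+ // Only the row pass clamps here; the column pass consumes the outputs
+ // unclamped.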
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ u0 = _mm_max_epi32(u0, clamp_lo);
+ u0 = _mm_min_epi32(u0, clamp_hi);
+ u1 = _mm_max_epi32(u1, clamp_lo);
+ u1 = _mm_min_epi32(u1, clamp_hi);
+ u2 = _mm_max_epi32(u2, clamp_lo);
+ u2 = _mm_min_epi32(u2, clamp_hi);
+ u3 = _mm_max_epi32(u3, clamp_lo);
+ u3 = _mm_min_epi32(u3, clamp_hi);
+ }
+
in[0] = u0;
in[1] = u1;
in[2] = u2;
@@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) {
in[3] = _mm_srai_epi32(in[3], shift);
}
-static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- __m128i clamped, mask;
-
- mask = _mm_cmpgt_epi16(u, max);
- clamped = _mm_andnot_si128(mask, u);
- mask = _mm_and_si128(mask, max);
- clamped = _mm_or_si128(mask, clamped);
- mask = _mm_cmpgt_epi16(clamped, zero);
- clamped = _mm_and_si128(clamped, mask);
-
- return clamped;
-}
-
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
int fliplr, int flipud, int shift, int bd) {
const __m128i zero = _mm_setzero_si128();
@@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
case ADST_DCT:
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
@@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
} else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
@@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
out[12] = u[5];
out[14] = _mm_sub_epi32(kZero, u[1]);
} else {
- neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
out_shift);
- neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
// Odd 8 points: 1, 3, ..., 15
@@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
out[13] = u[5];
out[15] = _mm_sub_epi32(kZero, u[1]);
} else {
- neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
out_shift);
- neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
@@ -976,64 +1258,1141 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
}
}
-// 16x16
-static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
- int i;
- for (i = 0; i < 64; ++i) {
- in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm_mullo_epi32(in[0], cospi32);
+ x = _mm_add_epi32(x, rnding);
+ x = _mm_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
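+ // Round to nearest: add half of (1 << out_shift) before the arithmetic
+ // shift.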
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
}
+
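+ // DC-only path: replicate the single transformed coefficient across all
+ // eight outputs.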
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
}
-static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
- int col) {
- int i;
- for (i = 0; i < 16; i += 2) {
- in8x8[i] = in[col];
- in8x8[i + 1] = in[col + 1];
- col += 4;
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
}
}
-static void swap_addr(uint16_t **output1, uint16_t **output2) {
- uint16_t *tmp;
- tmp = *output1;
- *output1 = *output2;
- *output2 = tmp;
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
}
-static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in8x8[16];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[8];
- uint16_t *leftDown = &output[8 * stride];
- uint16_t *rightDown = &output[8 * stride + 8];
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
}
+}
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (do_cols) {
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm_add_epi32(in[0], offset);
+ in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+ in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+ }
+
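+ // Broadcast the DC result to all 16 output rows.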
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
}
+}
- // Left-up quarter
- assign_8x8_input_from_16x16(in, in8x8, 0);
- write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
- // Right-up quarter
- assign_8x8_input_from_16x16(in, in8x8, 2);
- write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
- // Left-down quarter
- assign_8x8_input_from_16x16(in, in8x8, 32);
- write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
- // Right-down quarter
- assign_8x8_input_from_16x16(in, in8x8, 34);
- write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+
+ // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
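+// Editor's note: the _low1 variant above assumes only in[0] is non-zero, so
+// every stage-2 rotation collapses to a single multiply, round and shift.
+// A scalar sketch of that single-input half butterfly (illustrative only;
+// the 64-bit intermediate is a conservative modelling choice, not the exact
+// 32-bit SIMD behaviour):
+static INLINE int32_t half_btf_0_scalar_model(int32_t w0, int32_t in0,
+                                              int bit) {
+  return (int32_t)(((int64_t)w0 * in0 + (1 << (bit - 1))) >> bit);
+}
+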
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+  // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m128i zero = _mm_setzero_si128();
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
}
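+
+// Editor's note: a worked example of the intermediate clamp range used above,
+// assuming bd == 10. Row pass (do_cols == 0): log_range = AOMMAX(16, 10 + 8)
+// = 18, so lanes are clamped to [-131072, 131071]. Column pass (do_cols == 1):
+// log_range = AOMMAX(16, 10 + 6) = 16, giving [-32768, 32767].
+static INLINE int intermediate_log_range_model(int bd, int do_cols) {
+  return AOMMAX(16, bd + (do_cols ? 6 : 8));  // illustrative only
+}
+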
static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -1067,27 +2426,26 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
- int col;
- for (col = 0; col < 4; ++col) {
+ {
// stage 0
// stage 1
- u[0] = in[0 * 4 + col];
- u[1] = in[8 * 4 + col];
- u[2] = in[4 * 4 + col];
- u[3] = in[12 * 4 + col];
- u[4] = in[2 * 4 + col];
- u[5] = in[10 * 4 + col];
- u[6] = in[6 * 4 + col];
- u[7] = in[14 * 4 + col];
- u[8] = in[1 * 4 + col];
- u[9] = in[9 * 4 + col];
- u[10] = in[5 * 4 + col];
- u[11] = in[13 * 4 + col];
- u[12] = in[3 * 4 + col];
- u[13] = in[11 * 4 + col];
- u[14] = in[7 * 4 + col];
- u[15] = in[15 * 4 + col];
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
// stage 2
v[0] = u[0];
@@ -1200,37 +2558,37 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 7
if (do_cols) {
- addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
- out + 15 * 4 + col);
- addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
- out + 14 * 4 + col);
- addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
- out + 13 * 4 + col);
- addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
- out + 12 * 4 + col);
- addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
- out + 11 * 4 + col);
- addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
- out + 10 * 4 + col);
- addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
- addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
+ addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
} else {
- addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
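+
+// Editor's note: a scalar sketch of the paired fixed-point rotation that the
+// mullo/add/srai sequences above implement, with weights w0, w1 drawn from
+// cospi_arr(bit). Illustrative only; the SIMD code stays in 32-bit lanes,
+// the 64-bit intermediate here is just a conservative modelling choice.
+static INLINE void half_btf_pair_scalar_model(int32_t w0, int32_t w1,
+                                              int32_t in0, int32_t in1,
+                                              int32_t *out0, int32_t *out1,
+                                              int bit) {
+  const int64_t rnding = 1 << (bit - 1);
+  *out0 = (int32_t)(((int64_t)w0 * in0 + (int64_t)w1 * in1 + rnding) >> bit);
+  *out1 = (int32_t)(((int64_t)w1 * in0 - (int64_t)w0 * in1 + rnding) >> bit);
+}
+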
@@ -1269,106 +2627,104 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
- const int col_num = 4;
- int col;
// Calculate columns 0, 1, 2, 3
- for (col = 0; col < col_num; ++col) {
+ {
// stage 0
// stage 1
// stage 2
- v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
v[0] = _mm_add_epi32(v[0], x);
v[0] = _mm_add_epi32(v[0], rnding);
v[0] = _mm_srai_epi32(v[0], bit);
- v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
v[1] = _mm_sub_epi32(v[1], x);
v[1] = _mm_add_epi32(v[1], rnding);
v[1] = _mm_srai_epi32(v[1], bit);
- v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
v[2] = _mm_add_epi32(v[2], x);
v[2] = _mm_add_epi32(v[2], rnding);
v[2] = _mm_srai_epi32(v[2], bit);
- v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
v[3] = _mm_sub_epi32(v[3], x);
v[3] = _mm_add_epi32(v[3], rnding);
v[3] = _mm_srai_epi32(v[3], bit);
- v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
v[4] = _mm_add_epi32(v[4], x);
v[4] = _mm_add_epi32(v[4], rnding);
v[4] = _mm_srai_epi32(v[4], bit);
- v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
v[5] = _mm_sub_epi32(v[5], x);
v[5] = _mm_add_epi32(v[5], rnding);
v[5] = _mm_srai_epi32(v[5], bit);
- v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
v[6] = _mm_add_epi32(v[6], x);
v[6] = _mm_add_epi32(v[6], rnding);
v[6] = _mm_srai_epi32(v[6], bit);
- v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
v[7] = _mm_sub_epi32(v[7], x);
v[7] = _mm_add_epi32(v[7], rnding);
v[7] = _mm_srai_epi32(v[7], bit);
- v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
v[8] = _mm_add_epi32(v[8], x);
v[8] = _mm_add_epi32(v[8], rnding);
v[8] = _mm_srai_epi32(v[8], bit);
- v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
v[9] = _mm_sub_epi32(v[9], x);
v[9] = _mm_add_epi32(v[9], rnding);
v[9] = _mm_srai_epi32(v[9], bit);
- v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
v[10] = _mm_add_epi32(v[10], x);
v[10] = _mm_add_epi32(v[10], rnding);
v[10] = _mm_srai_epi32(v[10], bit);
- v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
v[11] = _mm_sub_epi32(v[11], x);
v[11] = _mm_add_epi32(v[11], rnding);
v[11] = _mm_srai_epi32(v[11], bit);
- v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
v[12] = _mm_add_epi32(v[12], x);
v[12] = _mm_add_epi32(v[12], rnding);
v[12] = _mm_srai_epi32(v[12], bit);
- v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
v[13] = _mm_sub_epi32(v[13], x);
v[13] = _mm_add_epi32(v[13], rnding);
v[13] = _mm_srai_epi32(v[13], bit);
- v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
v[14] = _mm_add_epi32(v[14], x);
v[14] = _mm_add_epi32(v[14], rnding);
v[14] = _mm_srai_epi32(v[14], bit);
- v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
v[15] = _mm_sub_epi32(v[15], x);
v[15] = _mm_add_epi32(v[15], rnding);
v[15] = _mm_srai_epi32(v[15], bit);
@@ -1575,268 +2931,835 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 9
if (do_cols) {
- out[0 * col_num + col] = v[0];
- out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
- out[2 * col_num + col] = v[12];
- out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
- out[4 * col_num + col] = v[6];
- out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
- out[6 * col_num + col] = v[10];
- out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
- out[8 * col_num + col] = v[3];
- out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
- out[10 * col_num + col] = v[15];
- out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
- out[12 * col_num + col] = v[5];
- out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
- out[14 * col_num + col] = v[9];
- out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
} else {
- neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
- out + 1 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
- out + 3 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
- out + 5 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
- out + 7 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
- out + 9 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
- out + 11 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
- out + 13 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
- out + 15 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
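+
+// Editor's note: a scalar model of how neg_shift_sse4_1 is assumed to behave
+// in the iadst output stages above: the second operand is negated before the
+// shared round/shift/clamp. Illustrative only.
+static INLINE void neg_shift_scalar_model(int32_t in0, int32_t in1,
+                                          int32_t *out0, int32_t *out1,
+                                          int32_t clamp_lo, int32_t clamp_hi,
+                                          int shift) {
+  const int32_t offset = (1 << shift) >> 1;
+  int32_t a0 = (offset + in0) >> shift;
+  int32_t a1 = (offset - in1) >> shift;
+  a0 = a0 < clamp_lo ? clamp_lo : (a0 > clamp_hi ? clamp_hi : a0);
+  a1 = a1 < clamp_lo ? clamp_lo : (a1 > clamp_hi ? clamp_hi : a1);
+  *out0 = a0;
+  *out1 = a1;
+}
+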
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m128i in[64], out[64];
- const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
- const int txw_idx = get_txw_idx(TX_16X16);
- const int txh_idx = get_txh_idx(TX_16X16);
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case DCT_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case ADST_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case ADST_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case FLIPADST_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
- break;
- case DCT_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
- break;
- case ADST_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
- break;
- case FLIPADST_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
- break;
- default: assert(0);
+static INLINE void idct64_stage8_sse4_1(
+ __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+ clamp_hi);
}
+
+ temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
}
-static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
- int i, j;
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
- __m128i zero = _mm_setzero_si128();
+ temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 8; ++j) {
- in[16 * i + j] =
- _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
- in[16 * i + j + 8] = zero;
- }
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ __m128i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
}
- for (i = 0; i < 512; ++i) in[512 + i] = zero;
+ temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
}
-static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
- int i, j;
- for (i = 0; i < (do_cols ? 16 : 8); ++i) {
- for (j = 0; j < 8; ++j) {
- TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
- in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
- out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
- out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+ int bd, int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ for (int i = 0; i < 32; i++) {
+      addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[i], &out[63 - i]);
+ }
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ for (int i = 0; i < 32; i++) {
+      addsub_shift_sse4_1(u[i], u[63 - i], &out[i], &out[63 - i],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
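+
+// Editor's note: worked numbers for the narrowed output clamp above, assuming
+// bd == 10 and out_shift == 2 in the row pass (log_range == 18,
+// log_range_out == 16): clamp_lo_out = AOMMAX(-(1 << 15), -(1 << 15)) ==
+// -32768, and clamp_hi_out = AOMMIN((1 << 15) - 1, 1 << 15) == 32767.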
-static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
- int col) {
- int i;
- for (i = 0; i < 16 * 16 / 4; i += 4) {
- in16x16[i] = in[col];
- in16x16[i + 1] = in[col + 1];
- in16x16[i + 2] = in[col + 2];
- in16x16[i + 3] = in[col + 3];
- col += 8;
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+ {
+ __m128i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
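+    // stage 7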
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (do_cols) {
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
+ }
+
+    for (int i = 0; i < 32; ++i) {
+      out[i] = x;
+      out[63 - i] = x;
+    }
}
}
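+
+// Editor's note: the _low1 path above assumes only the DC coefficient in[0]
+// survives, so the whole 64-point transform reduces to one rounded multiply
+// by cospi[32] broadcast to every output. Scalar sketch (illustrative only):
+static INLINE void idct64_dc_only_scalar_model(int32_t dc, int32_t cospi32,
+                                               int bit, int32_t *out64) {
+  const int32_t x =
+      (int32_t)(((int64_t)cospi32 * dc + (1 << (bit - 1))) >> bit);
+  for (int i = 0; i < 64; ++i) out64[i] = x;
+}
+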
-static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in16x16[16 * 16 / 4];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[16];
- uint16_t *leftDown = &output[16 * stride];
- uint16_t *rightDown = &output[16 * stride + 16];
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
- }
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
- }
+ {
+ __m128i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
- // Left-up quarter
- assign_16x16_input_from_32x32(in, in16x16, 0);
- write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
- // Right-up quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
- write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
- // Left-down quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
- write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
- // Right-down quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
- write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
-}
+ // stage 6
+ temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
-static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
- int col) {
- int i;
- for (i = 0; i < 32 * 32 / 4; i += 8) {
- in32x32[i] = in[col];
- in32x32[i + 1] = in[col + 1];
- in32x32[i + 2] = in[col + 2];
- in32x32[i + 3] = in[col + 3];
- in32x32[i + 4] = in[col + 4];
- in32x32[i + 5] = in[col + 5];
- in32x32[i + 6] = in[col + 6];
- in32x32[i + 7] = in[col + 7];
- col += 16;
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
}
}
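+
+// Editor's note: stage 1 of idct64x64_low8 above matches the 6-bit
+// bit-reversal permutation of the radix-2 IDCT input ordering, restricted to
+// the 8 coefficients this path assumes non-zero: u[bitrev6(k)] = in[k].
+// Sketch (illustrative only):
+//
+//   static INLINE int bitrev6(int k) {
+//     int r = 0;
+//     for (int b = 0; b < 6; ++b) r |= ((k >> b) & 1) << (5 - b);
+//     return r;
+//   }
+//   // bitrev6(0..7) = 0, 32, 16, 48, 8, 40, 24, 56, matching stage 1.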
-static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in32x32[32 * 32 / 4];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[32];
- uint16_t *leftDown = &output[32 * stride];
- uint16_t *rightDown = &output[32 * stride + 32];
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
- }
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
- }
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64];
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
- // Left-up quarter
- assign_32x32_input_from_64x64(in, in32x32, 0);
- write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
- // Right-up quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
- write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+ // stage 4
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
- // Left-down quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
- write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+ // stage 5
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
- // Right-down quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
- write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
}
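
/* half_btf_sse4_1 / half_btf_0_sse4_1 used throughout are vectorized
 * "half butterflies": each produces one rotated output of a butterfly
 * pair. Per 32-bit lane, with rnding = 1 << (bit - 1) as noted in
 * highbd_txfm_utility_sse4.h, the arithmetic is conceptually:
 *
 *   static int32_t half_btf(int32_t w0, int32_t n0, int32_t w1,
 *                           int32_t n1, int bit) {
 *     const int64_t r = (int64_t)w0 * n0 + (int64_t)w1 * n1;
 *     return (int32_t)((r + (1LL << (bit - 1))) >> bit);
 *   }
 *
 * half_btf_0 is the w1 == 0 case, saving one multiply; it dominates the
 * reduced-eob variants above because most inputs are known-zero. */
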
static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -1847,7 +3770,6 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- int col;
const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
@@ -1929,46 +3851,46 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
- for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
+ {
__m128i u[64], v[64];
// stage 1
- u[32] = in[1 * 16 + col];
- u[34] = in[17 * 16 + col];
- u[36] = in[9 * 16 + col];
- u[38] = in[25 * 16 + col];
- u[40] = in[5 * 16 + col];
- u[42] = in[21 * 16 + col];
- u[44] = in[13 * 16 + col];
- u[46] = in[29 * 16 + col];
- u[48] = in[3 * 16 + col];
- u[50] = in[19 * 16 + col];
- u[52] = in[11 * 16 + col];
- u[54] = in[27 * 16 + col];
- u[56] = in[7 * 16 + col];
- u[58] = in[23 * 16 + col];
- u[60] = in[15 * 16 + col];
- u[62] = in[31 * 16 + col];
-
- v[16] = in[2 * 16 + col];
- v[18] = in[18 * 16 + col];
- v[20] = in[10 * 16 + col];
- v[22] = in[26 * 16 + col];
- v[24] = in[6 * 16 + col];
- v[26] = in[22 * 16 + col];
- v[28] = in[14 * 16 + col];
- v[30] = in[30 * 16 + col];
-
- u[8] = in[4 * 16 + col];
- u[10] = in[20 * 16 + col];
- u[12] = in[12 * 16 + col];
- u[14] = in[28 * 16 + col];
-
- v[4] = in[8 * 16 + col];
- v[6] = in[24 * 16 + col];
-
- u[0] = in[0 * 16 + col];
- u[2] = in[16 * 16 + col];
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
// stage 2
v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
@@ -2301,39 +4223,1126 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 11
if (do_cols) {
for (i = 0; i < 32; i++) {
- addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
- &out[16 * (63 - i) + col]);
+ addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[i], &out[63 - i]);
}
} else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
for (i = 0; i < 32; i++) {
- addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
- &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
- out_shift);
+ addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
}
-void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m128i in[64 * 64 / 4], out[64 * 64 / 4];
- const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
- const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
- const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1;
+
+ // stage 0
+ // stage 1
+ bf1 = in[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ bf1 = _mm_add_epi32(bf1, offset);
+ bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+ bf1 = _mm_max_epi32(bf1, clamp_lo_out);
+ bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+ }
+ for (int i = 0; i < 32; ++i) out[i] = bf1;
+}
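+
+ /* DC-only path: with only in[0] nonzero, all nine stages collapse to a
+  * single scale by cospi32 followed by replication. Scalar sketch of the
+  * same arithmetic (clamp() is an illustrative helper, not aom API):
+  *
+  *   int32_t dc = (int32_t)(((int64_t)in0 * cospi[32] +
+  *                           (1LL << (bit - 1))) >> bit);
+  *   if (!do_cols) dc = (dc + ((1 << out_shift) >> 1)) >> out_shift;
+  *   dc = clamp(dc, lo, hi);  // bounds depend on do_cols, as above
+  *   for (int k = 0; k < 32; ++k) out[k] = dc;
+  */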
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
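+
+ /* With at most eight nonzero input rows, every early-stage butterfly
+  * has a known-zero partner, so stages 2-4 reduce to half_btf_0 single
+  * multiplies and only in[0]..in[7] are ever loaded; the shared
+  * idct32_stage4..9 helpers then run the dense tail this variant has in
+  * common with the _low16 version below. */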
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32], bf0[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
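+
+/* The clamp_lo/clamp_hi pairs above bound intermediates to the dynamic
+ * range expected for bit depth bd: log_range = max(16, bd + 8) during the
+ * row pass and max(16, bd + 6) once do_cols is set, i.e. values stay in
+ * [-(2^(log_range-1)), 2^(log_range-1) - 1]. A scalar model of
+ * addsub_sse4_1 under that contract:
+ *
+ *   static void addsub(int32_t a, int32_t b, int32_t *sum, int32_t *diff,
+ *                      int32_t lo, int32_t hi) {
+ *     const int32_t s = a + b, d = a - b;
+ *     *sum = s < lo ? lo : (s > hi ? hi : s);
+ *     *diff = d < lo ? lo : (d > hi ? hi : d);
+ *   }
+ */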
+
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
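+
+/* Dispatch pattern used above and repeated in the wrappers below: the
+ * identity-flavored types (V_/H_ DCT, ADST, FLIPADST and IDTX) have no
+ * SSE4.1 kernel in this file, so they fall back to the C reference
+ * transform; all other types take the SIMD path. */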
+
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
switch (tx_type) {
case DCT_DCT:
- load_buffer_64x64_lower_32x32(coeff, in);
- transpose_64x64(in, out, 0);
- idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_64x64(in, out, 1);
- idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // Assembly version doesn't support IDTX, so use C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
+ default: assert(0);
+ }
+}
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
default:
- av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
+static const transform_1d_sse4_1
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
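+
+/* The table is indexed [tx size][1-D transform type][eob range]. The last
+ * index picks a kernel specialized for how many leading inputs can be
+ * nonzero (1, 8, 16, or all); NULL slots are combinations that cannot be
+ * selected. A rough model of the eob-to-index mapping used below (the
+ * exact table is lowbd_txfm_all_1d_zeros_idx):
+ *
+ *   static int zeros_idx(int eobx) {  // eobx is 0-based
+ *     if (eobx == 0) return 0;        // _low1 kernel: DC only
+ *     if (eobx < 8) return 1;         // _low8 kernel
+ *     if (eobx < 16) return 2;        // _low16 kernel
+ *     return 3;                       // full kernel
+ *   }
+ */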
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (four rows per iteration)
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
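+
+/* Overall shape of the function above, eliding the 4x4 SIMD tiling: a 1-D
+ * row transform (preceded by a 1/sqrt(2) rescale via NewInvSqrt2 for
+ * rectangular sizes), a transpose, a 1-D column transform, and per-pass
+ * rounding shifts before the residual is added to the prediction. In
+ * scalar terms (row_txfm1d/col_txfm1d/transpose are illustrative names,
+ * not aom API):
+ *
+ *   for (int r = 0; r < txfm_size_row; ++r)
+ *     row_txfm1d(in + r * txfm_size_col, tmp + r * txfm_size_col);
+ *   transpose(tmp, txfm_size_col, txfm_size_row);
+ *   for (int c = 0; c < txfm_size_col; ++c)
+ *     col_txfm1d(tmp + c * txfm_size_row, out + c * txfm_size_row);
+ */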
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_sse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
break;
+ default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 608bd88a4..e298cf653 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index b29bd1d79..6f24e5948 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
-#define _HIGHBD_TXFM_UTILITY_SSE4_H
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
#include <smmintrin.h> /* SSE4.1 */
@@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
out[63]);
}
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+ for (int j = 0; j < 8; j++) {
+ for (int i = 0; i < 8; i++) {
+ TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+ input[i * 32 + j + 16], input[i * 32 + j + 24],
+ output[j * 32 + i + 0], output[j * 32 + i + 8],
+ output[j * 32 + i + 16], output[j * 32 + i + 24]);
+ }
+ }
+}
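+
+/* transpose_32x32 works on an 8x8 grid of 4x4 tiles: each __m128i holds
+ * four consecutive int32 row elements, so a 32-wide row is 8 vectors and
+ * input[i * 32 + j + 8 * r] is vector j of row 4 * i + r. TRANSPOSE_4X4
+ * flips one tile in registers, and writing it to output[j * 32 + i + 8 * r]
+ * swaps the tile coordinates (i, j) -> (j, i), completing the transpose. */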
+
// Note:
// rounding = 1 << (bit - 1)
static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
@@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
return x;
}
-#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
index a08beaafd..4bcab0564 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = {
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
};
-static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
- int sx, int alpha, int k,
- const int offset_bits_horiz,
- const int reduce_bits_horiz) {
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
@@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
- ((1 << reduce_bits_horiz) >> 1));
-
- // Calculate filtered results
- const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
- const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
- const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
- __m128i res_even =
- _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
+ coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
// Filter odd-index pixels
const __m128i tmp_1 = _mm_loadu_si128(
@@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+ coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+ int sx, __m128i *coeff) {
+ // Filter coeff
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+ coeff[4] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+ coeff[6] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+ coeff[1] = coeff[0];
+ coeff[3] = coeff[2];
+ coeff[5] = coeff[4];
+ coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+ const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+ const __m128i src_1 = *src;
+ const __m128i src2_1 = *src2;
- const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
- const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
- const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
- const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
__m128i res_odd =
_mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
@@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+ __m128i *tmp, int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ highbd_warp_horizontal_filter_alpha0_beta0(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha == 0 && beta != 0)
+ highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha != 0 && beta == 0)
+ highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else
+ highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
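This dispatch works because the horizontal filter phase is sx = sx4 + beta * (k + 4) per row and advances by alpha per column, so a zero alpha and/or beta makes the coefficient preparation loop-invariant. A minimal scalar sketch of the hoisting, with hypothetical stubs (prep_coeffs, filter_row) standing in for the highbd_* helpers above:

static void prep_coeffs(int sx, int *coeff) { coeff[0] = sx; } /* stub */
static void filter_row(const int *coeff, int k) { (void)coeff; (void)k; }

static void warp_rows_sketch(int sx4, int beta, int rows) {
  int coeff[8] = { 0 };
  if (beta == 0) prep_coeffs(sx4, coeff); /* hoisted: once per block */
  for (int k = 0; k < rows; ++k) {
    if (beta != 0) prep_coeffs(sx4 + beta * (k + 4), coeff); /* per row */
    filter_row(coeff, k);
  }
}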
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
int width, int height, int stride,
uint16_t *pred, int p_col, int p_row,
@@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
- horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
- offset_bits_horiz, reduce_bits_horiz);
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
}
} else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src2 =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
- horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
- reduce_bits_horiz);
- }
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
}
// Vertical filter
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index d1ea26290..9f2e2b457 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -13,7 +13,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
@@ -21,6 +20,21 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ return wt;
+}
+
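unpack_weights_avx2 interleaves the forward and backward offsets into 16-bit lanes [w0 w1 w0 w1 ...]. Paired with sample values interleaved the same way, a single _mm256_madd_epi16 then produces w0*p0 + w1*p1 per 32-bit lane, which is presumably how comp_avg consumes wt. A minimal sketch:

#include <immintrin.h>
/* pixels holds interleaved 16-bit pairs [p0 p1 p0' p1' ...]; each 32-bit
   lane of the result is w0*p0 + w1*p1. */
static inline __m256i weighted_pair_sums(__m256i pixels, __m256i wt) {
  return _mm256_madd_epi16(pixels, wt);
}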
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+ return _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
+}
+
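load_line2_avx2 packs two unaligned 16-byte rows into one 256-bit register: line a fills the low half, line b the high half. An equivalent sketch using a cast-and-insert in place of the permute:

#include <immintrin.h>
static inline __m256i load_line2_reference(const void *a, const void *b) {
  const __m128i lo = _mm_loadu_si128((const __m128i *)a);
  const __m128i hi = _mm_loadu_si128((const __m128i *)b);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}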
void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
const InterpFilterParams *filter_params_x,
@@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_1;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -68,13 +78,11 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)subpel_y_q4;
for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
for (j = 0; j < w; j += 8) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
- 0x20);
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
__m256i res = convolve_lowbd_x(data, coeffs, filt);
@@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
// Accumulate values into the destination buffer
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i round_const =
_mm256_set1_epi32((1 << conv_params->round_1) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
for (j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
-
// Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
@@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
if (w - j < 16) {
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -325,19 +282,12 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm256_add_epi16(res_hi_round, offset_const_2);
if (do_average) {
- const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
- const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
- 0x20);
+ const __m256i data_ref_0_lo = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
const __m256i comp_avg_res_lo =
comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
{
+ const uint8_t *src_h = src_ptr + j;
for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
if (i + 1 < im_h)
data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
__m256i res = convolve_lowbd_x(data, coeffs_x, filt);
res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
@@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const __m256i zero = _mm256_setzero_si256();
const int offset_0 =
@@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
index ffbb31849..f645e0454 100644
--- a/third_party/aom/av1/common/x86/reconinter_avx2.c
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -16,8 +16,504 @@
#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
#include "av1/common/blockd.h"
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+ const __m256i s1) {
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+ return _mm256_abs_epi16(
+ _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+  // clamp(diff, 0, 64) can be skipped since diff is always in the range (38, 54)
+}
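A scalar model of calc_mask_avx2 for a single pixel pair, assuming DIFF_FACTOR_LOG2 == 4 (the shift used above); mask_base is 38 for DIFFWTD_38 and 38 - AOM_BLEND_A64_MAX_ALPHA for DIFFWTD_38_INV:

#include <stdint.h>
#include <stdlib.h>
static uint8_t diffwtd_mask_scalar(int mask_base, uint8_t s0, uint8_t s1) {
  const int diff = abs((int)s0 - (int)s1);
  return (uint8_t)abs(mask_base + (diff >> 4)); /* DIFF_FACTOR_LOG2 == 4 */
}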
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + stride0);
+ const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + stride1);
+ const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+ xx_storeu_128(mask, x_m8);
+ src0 += (stride0 << 2);
+ src1 += (stride1 << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + stride0);
+ const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + stride1);
+ const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+      const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+      const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
+      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 2;
+ src1 += stride1 << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {
+ do {
+ const __m128i s0A = xx_load_128(src0);
+ const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 1;
+ src1 += stride1 << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff, int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ return diff_clamp;
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff,
+ int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
+ return diff_const_16;
+}
+
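Both helpers reduce to the same scalar recurrence, with the _inv variant flipping the result around the maximum alpha. A reference model, assuming DIFF_FACTOR_LOG2 == 4 and AOM_BLEND_A64_MAX_ALPHA == 64 (both asserted further down in this file):

#include <stdint.h>
static uint8_t diffwtd_mask_d16_scalar(uint16_t s0, uint16_t s1, int shift,
                                       int inv) {
  const int diff = s0 > s1 ? s0 - s1 : s1 - s0;        /* |s0 - s1| */
  const int rounded = (diff + ((1 << shift) >> 1)) >> shift;
  int m = 38 + (rounded >> 4);                         /* mask_base + factor */
  if (m > 64) m = 64;                                  /* clamp to max alpha */
  return (uint8_t)(inv ? 64 - m : m);
}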
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  // When the rounding constant is added, there is a possibility of overflow.
+  // However, that much precision is not required. The code should also work
+  // for other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA, but
+  // there is a possibility of corner-case bugs.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+ if (mask_type == DIFFWTD_38) {
+ build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ } else {
+ build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ }
+}
+
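The shift above undoes the compound convolution's intermediate precision: with FILTER_BITS == 7 it is 14 - round_0 - round_1 + (bd - 8), so assuming round_0 + round_1 == 14, an 8-bit stream gives shift == 0 and a 10-bit stream gives shift == 2.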
void av1_build_compound_diffwtd_mask_highbd_avx2(
uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
index 375def62e..0aaf1f454 100644
--- a/third_party/aom/av1/common/x86/selfguided_avx2.c
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
-void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0,
- int32_t *flt1, int flt_stride,
- int sgr_params_idx, int bit_depth,
- int highbd) {
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
// The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
// Ctl and Dtl is 32-byte aligned.
const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
- DECLARE_ALIGNED(32, int32_t,
- buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+ if (!buf) return -1;
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
}
+ aom_free(buf);
+ return 0;
}
void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
@@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
- width, eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
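With the return type changed from void to int, callers can now detect the aom_memalign failure instead of relying on the assert; a hypothetical caller:

if (av1_selfguided_restoration_avx2(dgd8, width, height, dgd_stride, flt0,
                                    flt1, flt_stride, sgr_params_idx,
                                    bit_depth, highbd) != 0) {
  /* allocation failed; propagate the error upward */
}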
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index c64150b9d..ea3f6d942 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
-void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
- int height, int dgd_stride,
- int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
- DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
- memset(buf, 0, sizeof(buf));
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+ int height, int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ int32_t *buf = (int32_t *)aom_memalign(
+ 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+ if (!buf) return -1;
+ memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
}
+ aom_free(buf);
+ return 0;
}
void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
@@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
- width, eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_sse4_1(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index efc542cbf..b810cea2e 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
9, 11, 11, 13, 13, 15, 15, 0 };
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
- int alpha, int k,
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
const int offset_bits_horiz,
- const int reduce_bits_horiz) {
+ const int reduce_bits_horiz, int k) {
const __m128i src_even =
_mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
const __m128i src_odd =
_mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+ _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+ const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
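The wrapping argument in the comment above, restated in scalar form: 16-bit lane additions are modulo 2^16, so as long as the exact sum res_even + res_odd + round_const lies in [0, 65535], reading the lanes back as uint16 is exact even if intermediate int16 values wrapped:

#include <stdint.h>
static inline uint16_t wrap_sum16(int16_t even, int16_t odd,
                                  uint16_t round_const) {
  /* Unsigned arithmetic is modulo 2^16, matching _mm_add_epi16 lane-wise. */
  return (uint16_t)((uint16_t)even + (uint16_t)odd + round_const);
}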
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadl_epi64(
(__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
@@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
// Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
// Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
// Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
// Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+ coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
- // The pixel order we need for 'src' is:
- // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
- const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
- const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
- // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
- const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
- _mm_srli_si128(src_odd, 4));
- const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
- // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
- const __m128i src_13 =
- _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
- const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
- // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
- const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
- _mm_srli_si128(src_even, 6));
- const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
- const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
- ((1 << reduce_bits_horiz) >> 1));
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+}
- // Note: The values res_02 + res_46 and res_13 + res_57 both
- // fit into int16s at this point, but their sum may be too wide to fit
- // into an int16. However, once we also add round_const, the sum of
- // all of these fits into a uint16.
- //
- // The wrapping behaviour of _mm_add_* is used here to make sure we
- // get the correct result despite converting between different
- // (implicit) types.
- const __m128i res_even = _mm_add_epi16(res_02, res_46);
- const __m128i res_odd = _mm_add_epi16(res_13, res_57);
- const __m128i res =
- _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
- tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+ int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+ int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta,
+ int p_height, int height, int i,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+ *res_sub_const =
+ _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ *wt = _mm_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // even coeffs
+ coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ // odd coeffs
+ coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
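+// Editorial note: gamma == 0 means all eight columns share one filter, so
+// a single load plus four byte-shuffles replicate the tap pairs, and the
+// odd coefficient sets simply alias the even ones.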
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ // even coeffs
+ coeffs[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+ coeffs[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+ coeffs[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+ coeffs[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+
+ // odd coeffs
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
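+// Editorial note: applies the 8-tap vertical filter to one output row:
+// four _mm_madd_epi16 per parity accumulate the tap pairs, and the even/odd
+// partial sums are re-interleaved into pixel order 0..7 at the end.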
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+ __m128i *res_lo, __m128i *res_hi,
+ int k) {
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+ const __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+ const __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
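+// Editorial note: final rounding/packing stage. On the compound path the
+// 32-bit sums are rounded to 16 bits and either written to the CONV_BUF or
+// averaged (optionally with jnt_comp weights) into 'pred'; otherwise the
+// result is packed straight down to 8 bits, storing only 4 pixels when
+// p_width == 4.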
+static INLINE void store_vertical_filter_output(
+ __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+ const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+ uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+ const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m128i res_lo_1 = *res_lo;
+ __m128i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
+ }
+ } else {
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+}
+
+static INLINE void warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ (void)gamma;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ (void)gamma;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
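+// Editorial note: dispatches to the gamma0/delta0 specializations so the
+// vertical coefficients are recomputed only when they actually vary
+// (gamma != 0 across columns, delta != 0 across rows).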
+static INLINE void prepare_warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+ sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else
+ warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+}
+
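+// Editorial note: horizontal counterpart of the dispatch above, keyed on
+// alpha (variation across columns) and beta (variation across rows).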
+static INLINE void prepare_warp_horizontal_filter(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else
+ warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
@@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
- const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
const __m128i reduce_bits_vert_const =
_mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const __m128i res_sub_const =
- _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
- (1 << (offset_bits - conv_params->round_1 - 1)));
- __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
- __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi16(w0);
- const __m128i wt1 = _mm_set1_epi16(w1);
- const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
/* Note: For this code to work, the left/right frame borders need to be
@@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
}
}*/
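+  // Editorial note: fold the vertical rounding offsets into one constant up
+  // front; the compound path combines the round constant with the offset
+  // term, while the non-compound path uses the signed constant that the
+  // removed loop body previously rebuilt on every iteration.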
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
for (i = 0; i < p_height; i += 8) {
for (j = 0; j < p_width; j += 8) {
@@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
reduce_bits_horiz);
}
} else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
- reduce_bits_horiz);
- }
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
}
// Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- if (conv_params->is_compound) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- res_lo = _mm_add_epi32(res_lo, res_add_const);
- res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
- reduce_bits_vert_shift);
- const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
- __m128i res_lo_16;
- if (conv_params->do_average) {
- __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
- const __m128i p_16 = _mm_loadl_epi64(p);
-
- if (conv_params->use_jnt_comp_avg) {
- const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
- const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
- const __m128i shifted_32 =
- _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
- res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
- } else {
- res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
- }
-
- res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
-
- res_lo_16 = _mm_sra_epi16(
- _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
- __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
- *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
- } else {
- _mm_storel_epi64(p, temp_lo_16);
- }
- if (p_width > 4) {
- __m128i *const p4 =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
- res_hi = _mm_add_epi32(res_hi, res_add_const);
- res_hi =
- _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
- reduce_bits_vert_shift);
- const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
- __m128i res_hi_16;
-
- if (conv_params->do_average) {
- __m128i *const dst8_4 =
- (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
- const __m128i p4_16 = _mm_loadl_epi64(p4);
-
- if (conv_params->use_jnt_comp_avg) {
- const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
- const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
- const __m128i shifted_32 =
- _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
- res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
- } else {
- res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
- }
- res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
-
- res_hi_16 = _mm_sra_epi16(
- _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
- __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
- *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
-
- } else {
- _mm_storel_epi64(p4, temp_hi_16);
- }
- }
- } else {
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
- ((1 << reduce_bits_vert) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
- const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
- } else {
- _mm_storel_epi64(p, res_8bit);
- }
- }
- }
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
}
}
}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
index e1449fd21..87a6e1239 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
DECLARE_ALIGNED(32, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
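+  // Editorial note (assumption): computing one fewer intermediate row keeps
+  // the horizontal pass from reading past the source buffer; the cleared
+  // row below supplies the final vertical tap in its place.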
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
index 3083d224b..f9d00b733 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
DECLARE_ALIGNED(16, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
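+  // Editorial note (assumption): same over-read fix as the AVX2 version --
+  // one fewer intermediate row, with the following row cleared to feed the
+  // final vertical tap.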
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
int i, j;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
index 9099d081b..288e5e63e 100644
--- a/third_party/aom/av1/decoder/accounting.h
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_ACCOUNTING_H_
-#define AOM_ACCOUNTING_H_
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
#include <stdlib.h>
#include "aom/aomdx.h"
@@ -79,4 +79,4 @@ void aom_accounting_dump(Accounting *accounting);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_ACCOUNTING_H_
+#endif // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index 6dbc4f3eb..31f14b531 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -43,6 +43,7 @@
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
#include "av1/common/idct.h"
#include "av1/common/mvref_common.h"
#include "av1/common/pred_common.h"
@@ -87,18 +88,25 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
const YV12_BUFFER_CONFIG *const buf,
int only_chroma) {
- const int val = 1 << (seq_params->bit_depth - 1);
-
- for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
- const int is_uv = plane > 0;
- for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
- if (seq_params->use_highbitdepth) {
- // TODO(yaowu): replace this with aom_memset16() for speed
- for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
- uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
- base[row_idx * buf->strides[is_uv] + col_idx] = val;
+ if (seq_params->use_highbitdepth) {
+ const int val = 1 << (seq_params->bit_depth - 1);
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+ // Set the first row to neutral grey. Then copy the first row to all
+ // subsequent rows.
+ if (buf->crop_heights[is_uv] > 0) {
+ aom_memset16(base, val, buf->crop_widths[is_uv]);
+ for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+ memcpy(&base[row_idx * buf->strides[is_uv]], base,
+ sizeof(*base) * buf->crop_widths[is_uv]);
}
- } else {
+ }
+ }
+ } else {
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
buf->crop_widths[is_uv]);
}
@@ -687,11 +695,10 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
for (int x = 0; x < b8_w; x += b4_w) {
MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
is_compound = has_second_ref(this_mbmi);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
int tmp_dst_stride = 8;
assert(bw < 8 || bh < 8);
ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
conv_params.use_jnt_comp_avg = 0;
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
@@ -735,7 +742,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
&pre, &src_stride);
- conv_params.ref = ref;
conv_params.do_average = ref;
if (is_masked_compound_type(mi->interinter_comp.type)) {
// masked compound type has its own average mechanism
@@ -762,7 +768,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
uint8_t *const dst = dst_buf->buf;
uint8_t *pre[2];
SubpelParams subpel_params[2];
- DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
int src_stride[2];
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
@@ -797,7 +802,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
}
ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
&conv_params.bck_offset,
&conv_params.use_jnt_comp_avg, is_compound);
@@ -808,7 +813,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global[ref];
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- conv_params.ref = ref;
conv_params.do_average = ref;
if (is_masked_compound_type(mi->interinter_comp.type)) {
// masked compound type has its own average mechanism
@@ -931,7 +935,7 @@ static void dec_build_prediction_by_above_preds(
// Adjust mb_to_bottom_edge to have the correct value for the OBMC
// prediction block. This is half the height of the original block,
// except for 128-wide blocks, where we only use a height of 32.
- int this_height = xd->n8_h * MI_SIZE;
+ int this_height = xd->n4_h * MI_SIZE;
int pred_height = AOMMIN(this_height / 2, 32);
xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
@@ -984,7 +988,7 @@ static void dec_build_prediction_by_left_preds(
// Adjust mb_to_right_edge to have the correct value for the OBMC
// prediction block. This is half the width of the original block,
// except for 128-wide blocks, where we only use a width of 32.
- int this_width = xd->n8_w * MI_SIZE;
+ int this_width = xd->n4_w * MI_SIZE;
int pred_width = AOMMIN(this_width / 2, 32);
xd->mb_to_right_edge += (this_width - pred_width) * 8;
@@ -1006,8 +1010,6 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
MACROBLOCKD *xd, int mi_row,
int mi_col) {
const int num_planes = av1_num_planes(cm);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -1018,19 +1020,23 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int len = sizeof(uint16_t);
- dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
- dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
- dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
- dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
- dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
- dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
} else {
- dst_buf1[0] = tmp_buf1;
- dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
- dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
- dst_buf2[0] = tmp_buf2;
- dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
- dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
}
dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
dst_width1, dst_height1, dst_stride1);
@@ -1069,8 +1075,9 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
}
dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
- if (mbmi->motion_mode == OBMC_CAUSAL)
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
#if CONFIG_MISMATCH_DEBUG
for (int plane = 0; plane < num_planes; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -1225,9 +1232,18 @@ static void decode_token_recon_block(AV1Decoder *const pbi,
set_color_index_map_offset);
}
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi);
+#endif
+
static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
- TX_SIZE tx_size, int depth, int blk_row,
- int blk_col, aom_reader *r) {
+ TX_SIZE tx_size, int depth,
+#if LOOP_FILTER_BITMASK
+ AV1_COMMON *cm, int mi_row, int mi_col,
+#endif
+ int blk_row, int blk_col, aom_reader *r) {
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
int is_split = 0;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -1271,15 +1287,29 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
mbmi->tx_size = sub_txs;
txfm_partition_update(xd->above_txfm_context + blk_col,
xd->left_txfm_context + blk_row, sub_txs, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8,
+ TX_4X4, mbmi);
+#endif
return;
}
+#if LOOP_FILTER_BITMASK
+ if (depth + 1 == MAX_VARTX_DEPTH) {
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], sub_txs, mbmi);
+ }
+#endif
assert(bsw > 0 && bsh > 0);
for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
int offsetr = blk_row + row;
int offsetc = blk_col + col;
- read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
+ read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ offsetr, offsetc, r);
}
}
} else {
@@ -1293,6 +1323,10 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
mbmi->tx_size = tx_size;
txfm_partition_update(xd->above_txfm_context + blk_col,
xd->left_txfm_context + blk_row, tx_size, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], tx_size, mbmi);
+#endif
}
}
@@ -1330,6 +1364,191 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
}
}
+#if LOOP_FILTER_BITMASK
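+// Editorial note: records per-block loop-filter state (transform-size bit
+// masks, border/skip bits and filter levels) into the 64x64 LoopFilterMask
+// so the bitmask-based loop filter can run without re-walking mode info.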
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+ mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (tx_size - TX_4X16);
+ }
+ int index = 0;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ // Use a lookup table that provides one bitmask for a given block size and
+ // a univariant transform size.
+ int index;
+ int shift;
+ int row;
+ int col;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (mbmi->tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
+ mask_id =
+ 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (mbmi->tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (mbmi->tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (mbmi->tx_size - TX_4X16);
+ }
+ row = mi_row % MI_SIZE_64X64;
+ col = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ int index;
+ int shift;
+ int row;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int row_start = mi_row % MI_SIZE_64X64;
+ const int col_start = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col_start, row_start, &index);
+ const uint64_t top_edge_mask =
+ ((uint64_t)1 << (shift + mi_size_wide[bsize])) - ((uint64_t)1 << shift);
+ lfm->is_horz_border.bits[index] |= top_edge_mask;
+ const int is_vert_border = mask_id_table_vert_border[bsize];
+ const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->is_vert_border.bits[i + index] |=
+ (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift);
+ }
+ const int is_skip = mbmi->skip && is_inter_block(mbmi);
+ if (is_skip) {
+ const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->skip.bits[i + index] |=
+ (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+ }
+ }
+ const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+ const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+ const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+ const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+ for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+ index = 0;
+ row = r % MI_SIZE_64X64;
+ memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_u[row][col_start], level_u,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_v[row][col_start], level_v,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ }
+}
+#endif
+
static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
int mi_row, int mi_col, aom_reader *r,
PARTITION_TYPE partition, BLOCK_SIZE bsize) {
@@ -1353,14 +1572,46 @@ static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
for (int idy = 0; idy < height; idy += bh)
for (int idx = 0; idx < width; idx += bw)
- read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
+ read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ idy, idx, r);
} else {
mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
if (inter_block_tx)
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
mbmi->skip && is_inter_block(mbmi), xd);
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+ BLOCK_64X64, mbmi);
+ }
+ }
+ }
+#endif
+ }
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
+ mbmi);
+ }
+ }
}
+#endif
if (cm->delta_q_present_flag) {
for (int i = 0; i < MAX_SEGMENTS; i++) {
@@ -1952,6 +2203,11 @@ static void setup_quantization(AV1_COMMON *const cm,
cm->v_dc_delta_q = cm->u_dc_delta_q;
cm->v_ac_delta_q = cm->u_ac_delta_q;
}
+ } else {
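+    // Editorial note: zero the chroma delta-qs explicitly when none are
+    // coded, so later reads see deterministic values.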
+ cm->u_dc_delta_q = 0;
+ cm->u_ac_delta_q = 0;
+ cm->v_dc_delta_q = 0;
+ cm->v_ac_delta_q = 0;
}
cm->dequant_bit_depth = seq_params->bit_depth;
cm->using_qmatrix = aom_rb_read_bit(rb);
@@ -2082,29 +2338,9 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
cm->cur_frame->height = cm->height;
}
-static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
- struct aom_read_bit_buffer *rb) {
- const SequenceHeader *const seq_params = &cm->seq_params;
- int width, height;
+static void setup_buffer_pool(AV1_COMMON *cm) {
BufferPool *const pool = cm->buffer_pool;
-
- if (frame_size_override_flag) {
- int num_bits_width = seq_params->num_bits_width;
- int num_bits_height = seq_params->num_bits_height;
- av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
- if (width > seq_params->max_frame_width ||
- height > seq_params->max_frame_height) {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
- "Frame dimensions are larger than the maximum values");
- }
- } else {
- width = seq_params->max_frame_width;
- height = seq_params->max_frame_height;
- }
-
- setup_superres(cm, rb, &width, &height);
- resize_context_buffers(cm, width, height);
- setup_render_size(cm, rb);
+ const SequenceHeader *const seq_params = &cm->seq_params;
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
@@ -2140,6 +2376,31 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
}
+static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+ struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ int width, height;
+
+ if (frame_size_override_flag) {
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+ if (width > seq_params->max_frame_width ||
+ height > seq_params->max_frame_height) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame dimensions are larger than the maximum values");
+ }
+ } else {
+ width = seq_params->max_frame_width;
+ height = seq_params->max_frame_height;
+ }
+
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ setup_render_size(cm, rb);
+ setup_buffer_pool(cm);
+}
+
static void setup_sb_size(SequenceHeader *seq_params,
struct aom_read_bit_buffer *rb) {
set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
@@ -2158,7 +2419,6 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
int width, height;
int found = 0;
int has_valid_ref_frame = 0;
- BufferPool *const pool = cm->buffer_pool;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
if (aom_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
@@ -2208,39 +2468,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has incompatible color format");
}
-
- lock_buffer_pool(pool);
- if (aom_realloc_frame_buffer(
- get_frame_new_buffer(cm), cm->width, cm->height,
- seq_params->subsampling_x, seq_params->subsampling_y,
- seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment,
- &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
- pool->cb_priv)) {
- unlock_buffer_pool(pool);
- aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
- "Failed to allocate frame buffer");
- }
- unlock_buffer_pool(pool);
-
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
- seq_params->subsampling_x;
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
- seq_params->subsampling_y;
- pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
- (unsigned int)seq_params->bit_depth;
- pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
- seq_params->color_primaries;
- pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
- seq_params->transfer_characteristics;
- pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
- seq_params->matrix_coefficients;
- pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
- pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
- seq_params->chroma_sample_position;
- pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
- pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
- pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+ setup_buffer_pool(cm);
}
// Same function as av1_read_uniform but reading from uncompressed header wb
@@ -2252,7 +2480,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
if (v < m)
return v;
else
- return (v << 1) - m + aom_rb_read_literal(rb, 1);
+ return (v << 1) - m + aom_rb_read_bit(rb);
}
static void read_tile_info_max_tile(AV1_COMMON *const cm,
@@ -2344,6 +2572,10 @@ static void read_tile_info(AV1Decoder *const pbi,
// tile to use for cdf update
cm->context_update_tile_id =
aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid context_update_tile_id");
+ }
// tile size magnitude
pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
}
@@ -2746,31 +2978,13 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
#endif // CONFIG_MULTITHREAD
}
-static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) {
- AV1_COMMON *cm = &pbi->common;
- int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
- tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
- int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
- return sb_rows;
-}
-
-static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) {
- AV1_COMMON *cm = &pbi->common;
- int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
- tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
- int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
- return sb_cols;
-}
-
static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
TileInfo tile_info, const int mi_row) {
AV1_COMMON *const cm = &pbi->common;
const int num_planes = av1_num_planes(cm);
TileDataDec *const tile_data =
pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
- const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info);
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
const int sb_row_in_tile =
(mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
int sb_col_in_tile = 0;
@@ -2792,15 +3006,11 @@ static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
}
static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
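+  // Editorial note: an overflowed reader would make the tell/begin
+  // arithmetic below meaningless, so bail out first; this check subsumes
+  // the explicit begin/end bounds test that was removed.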
+ if (aom_reader_has_overflowed(r)) return -1;
+
uint32_t nb_bits = aom_reader_tell(r);
uint32_t nb_bytes = (nb_bits + 7) >> 3;
-
- const uint8_t *p_begin = aom_reader_find_begin(r);
- const uint8_t *p_end = aom_reader_find_end(r);
-
- // It is legal to have no padding bytes (nb_bytes == p_end - p_begin).
- if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1;
- const uint8_t *p = p_begin + nb_bytes;
+ const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
// aom_reader_tell() returns 1 for a newly initialized decoder, and the
// return value only increases as values are decoded. So nb_bits > 0, and
@@ -2810,6 +3020,7 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
// Make sure that all padding bytes are zero as required by the spec.
+ const uint8_t *p_end = aom_reader_find_end(r);
while (p < p_end) {
if (*p != 0) return -1;
p++;
@@ -2863,6 +3074,11 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
// Bit-stream parsing and decoding of the superblock
decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
cm->seq_params.sb_size, 0x3);
+
+ if (aom_reader_has_overflowed(td->bit_reader)) {
+ aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+ return;
+ }
}
}
@@ -2950,6 +3166,11 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
td->xd.corrupted = 0;
td->xd.mc_buf[0] = td->mc_buf[0];
td->xd.mc_buf[1] = td->mc_buf[1];
+ td->xd.tmp_conv_dst = td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+ }
+
for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
@@ -3236,6 +3457,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
#endif
frame_row_mt_info->row_mt_exit = 1;
#if CONFIG_MULTITHREAD
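+      // Editorial note: wake every worker blocked on row_mt_cond_ so it can
+      // observe row_mt_exit and terminate instead of waiting forever.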
+ pthread_cond_broadcast(pbi->row_mt_cond_);
pthread_mutex_unlock(pbi->row_mt_mutex_);
#endif
return 0;
@@ -3386,16 +3608,24 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
}
-void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) {
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
int ref;
for (ref = 0; ref < 2; ref++) {
- if (use_highbd)
+ if (thread_data->mc_buf_use_highbd)
aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
else
aom_free(thread_data->mc_buf[ref]);
thread_data->mc_buf[ref] = NULL;
}
thread_data->mc_buf_size = 0;
+ thread_data->mc_buf_use_highbd = 0;
+
+ aom_free(thread_data->tmp_conv_dst);
+ thread_data->tmp_conv_dst = NULL;
+ for (int i = 0; i < 2; ++i) {
+ aom_free(thread_data->tmp_obmc_bufs[i]);
+ thread_data->tmp_obmc_bufs[i] = NULL;
+ }
}
static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
@@ -3411,6 +3641,17 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
}
}
thread_data->mc_buf_size = buf_size;
+ thread_data->mc_buf_use_highbd = use_highbd;
+
+ CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->tmp_conv_dst)));
+ for (int i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->tmp_obmc_bufs[i])));
+ }
}
static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
@@ -3425,6 +3666,10 @@ static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
thread_data->td->xd.corrupted = 0;
thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
+ thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+ }
winterface->sync(worker);
worker->hook = worker_hook;
@@ -3511,7 +3756,7 @@ static void decode_mt_init(AV1Decoder *pbi) {
for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
if (thread_data->td->mc_buf_size != buf_size) {
- av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+ av1_free_mc_tmp_buf(thread_data->td);
allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
}
}
@@ -3783,8 +4028,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
av1_tile_init(&tile_data->tile_info, cm, row, col);
- max_sb_rows =
- AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info));
+ max_sb_rows = AOMMAX(max_sb_rows,
+ av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
}
}
@@ -3905,6 +4150,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
if (!seq_params->monochrome)
pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+ else
+ pars->chroma_scaling_from_luma = 0;
if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
@@ -4412,6 +4659,29 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
*cm->fc = cm->frame_contexts[existing_frame_idx];
}
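+
+// Editorial note: invoked when a new coded video sequence starts at a
+// keyframe; releases every reference buffer except the frame currently
+// being decoded and clears the frame-offset bookkeeping.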
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (i != cm->new_fb_idx) {
+ frame_bufs[i].ref_count = 0;
+ cm->buffer_pool->release_fb_cb(cm->buffer_pool->cb_priv,
+ &frame_bufs[i].raw_frame_buffer);
+ } else {
+ assert(frame_bufs[i].ref_count == 1);
+ }
+ frame_bufs[i].cur_frame_offset = 0;
+ av1_zero(frame_bufs[i].ref_frame_offset);
+ }
+ av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
+ unlock_buffer_pool(cm->buffer_pool);
+}
+
// On success, returns 0. On failure, calls aom_internal_error and does not
// return.
static int read_uncompressed_header(AV1Decoder *pbi,
@@ -4443,6 +4713,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->reset_decoder_state = 0;
if (cm->show_existing_frame) {
+ if (pbi->sequence_header_changed) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "New sequence header starts with a show_existing_frame.");
+ }
// Show an existing frame directly.
const int existing_frame_idx = aom_rb_read_literal(rb, 3);
const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
@@ -4493,6 +4768,18 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits
+ if (pbi->sequence_header_changed) {
+ if (pbi->common.frame_type == KEY_FRAME) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(&pbi->common);
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Sequence header has changed without a keyframe.");
+ }
+ }
+
cm->show_frame = aom_rb_read_bit(rb);
if (seq_params->still_picture &&
(cm->frame_type != KEY_FRAME || !cm->show_frame)) {
@@ -4582,8 +4869,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
}
- frame_size_override_flag =
- frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
+ frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
cm->frame_offset =
aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
@@ -5152,7 +5438,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
if (pbi->td.mc_buf_size != buf_size) {
- av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+ av1_free_mc_tmp_buf(&pbi->td);
allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
}
}
@@ -5166,6 +5452,11 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
const int tile_count_tg = end_tile - start_tile + 1;
if (initialize_flag) setup_frame_info(pbi);
+ const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame_init(cm, 0, num_planes);
+ av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
pbi->row_mt)
@@ -5177,7 +5468,6 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
else
*p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
- const int num_planes = av1_num_planes(cm);
// If the bit stream is monochrome, set the U and V buffers to a constant.
if (num_planes < 3) {
set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
@@ -5190,7 +5480,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (!cm->allow_intrabc && !cm->single_tile_decoding) {
if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0,
num_planes, 0);
#else
if (pbi->num_workers > 1) {
@@ -5255,6 +5545,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (!xd->corrupted) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ assert(cm->context_update_tile_id < pbi->allocated_tiles);
*cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
av1_reset_cdf_symbol_counters(cm->fc);
}
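
The decodeframe.c changes above make the per-thread motion-compensation scratch buffers self-describing: allocate_mc_tmp_buf records mc_buf_use_highbd next to the pointers it creates, so av1_free_mc_tmp_buf no longer needs the caller to reconstruct that flag from seq_params (which can change between allocation and teardown). A minimal sketch of the pattern, with hypothetical names and plain malloc/free standing in for aom_memalign/aom_free:

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint8_t *buf[2];
  int size;
  int use_highbd;  /* recorded at allocation, consulted at free */
} McScratch;

static void mc_scratch_alloc(McScratch *s, int size, int use_highbd) {
  for (int i = 0; i < 2; ++i) {
    /* In libaom a highbd buffer is an array of uint16_t stored through a
     * converted byte pointer; a plain cast models that here. */
    s->buf[i] = use_highbd ? (uint8_t *)malloc(sizeof(uint16_t) * size)
                           : (uint8_t *)malloc(size);
  }
  s->size = size;
  s->use_highbd = use_highbd;
}

static void mc_scratch_free(McScratch *s) {
  for (int i = 0; i < 2; ++i) {
    /* No flag argument: the struct knows how its pointers were made, so a
     * stale caller-supplied flag can no longer mismatch the free. */
    if (s->use_highbd)
      free((uint16_t *)s->buf[i]);  /* stand-in for CONVERT_TO_SHORTPTR */
    else
      free(s->buf[i]);
    s->buf[i] = NULL;
  }
  s->size = 0;
  s->use_highbd = 0;
}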
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index d289b31f2..ddad273f1 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODEFRAME_H_
-#define AV1_DECODER_DECODEFRAME_H_
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
#ifdef __cplusplus
extern "C" {
@@ -74,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer(
struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
const uint8_t *data_end);
-void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd);
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
@@ -82,4 +82,4 @@ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
} // extern "C"
#endif
-#endif // AV1_DECODER_DECODEFRAME_H_
+#endif // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index 5e920b18d..551e4d543 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -94,42 +94,26 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
}
return reduced_delta_qindex;
}
-static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd,
- aom_reader *r, int lf_id,
- MB_MODE_INFO *const mbmi, int mi_col,
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+ aom_cdf_prob *const cdf,
+ const MB_MODE_INFO *const mbmi, int mi_col,
int mi_row) {
- int sign, abs, reduced_delta_lflevel = 0;
- BLOCK_SIZE bsize = mbmi->sb_type;
+ int reduced_delta_lflevel = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int b_col = mi_col & (cm->seq_params.mib_size - 1);
const int b_row = mi_row & (cm->seq_params.mib_size - 1);
const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
read_delta_lf_flag) {
- if (cm->delta_lf_multi) {
- assert(lf_id >= 0 &&
- lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
- : FRAME_LF_COUNT - 2));
- abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id],
- DELTA_LF_PROBS + 1, ACCT_STR);
- } else {
- abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1,
- ACCT_STR);
- }
+ int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
const int smallval = (abs < DELTA_LF_SMALL);
if (!smallval) {
const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
const int thr = (1 << rem_bits) + 1;
abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
}
-
- if (abs) {
- sign = aom_read_bit(r, ACCT_STR);
- } else {
- sign = 1;
- }
-
+ const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
reduced_delta_lflevel = sign ? -abs : abs;
}
return reduced_delta_lflevel;
@@ -618,19 +602,22 @@ static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
int blk_col, TX_SIZE tx_size, aom_reader *r) {
MB_MODE_INFO *mbmi = xd->mi[0];
- const int inter_block = is_inter_block(mbmi);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-
const int txk_type_idx =
av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+ *tx_type = DCT_DCT;
+
+  // No need to read the transform type if the block is skipped.
+ if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ return;
+
+  // No need to read the transform type in lossless mode (qindex == 0).
+ const int qindex =
+ cm->seg.enabled ? xd->qindex[mbmi->segment_id] : cm->base_qindex;
+ if (qindex <= 0) return;
- const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
- if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 &&
- ((!cm->seg.enabled && cm->base_qindex > 0) ||
- (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
- !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int inter_block = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) {
const TxSetType tx_set_type =
av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
const int eset =
@@ -639,23 +626,22 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
// there is no need to read the tx_type
assert(eset != 0);
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
if (inter_block) {
*tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
} else {
- PREDICTION_MODE intra_dir;
- if (mbmi->filter_intra_mode_info.use_filter_intra)
- intra_dir =
- fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
- else
- intra_dir = mbmi->mode;
+ const PREDICTION_MODE intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode]
+ : mbmi->mode;
*tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
- r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+ r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
}
- } else {
- *tx_type = DCT_DCT;
}
}
@@ -720,6 +706,43 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
}
}
+// If delta q is present, reads the delta_q index.
+// Also reads the delta loop filter levels, if present.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ const int mi_row, const int mi_col,
+ aom_reader *r) {
+ if (cm->delta_q_present_flag) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ xd->current_qindex +=
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int tmp_lvl =
+ xd->delta_lf[lf_id] +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
+ mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ } else {
+ const int tmp_lvl = xd->delta_lf_from_base +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
+ mbmi, mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+}
+
static void read_intra_frame_mode_info(AV1_COMMON *const cm,
MACROBLOCKD *const xd, int mi_row,
int mi_col, aom_reader *r) {
@@ -743,33 +766,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
read_cdef(cm, r, xd, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- xd->current_qindex +=
- read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
- /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
- xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- const int tmp_lvl =
- xd->delta_lf[lf_id] +
- read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- } else {
- const int tmp_lvl =
- xd->delta_lf_from_base +
- read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf_from_base = xd->delta_lf_from_base =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- }
- }
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
mbmi->current_qindex = xd->current_qindex;
@@ -1402,7 +1399,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
mbmi->motion_mode = SIMPLE_TRANSLATION;
if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
!has_second_ref(mbmi))
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
if (mbmi->ref_frame[1] != INTRA_FRAME)
@@ -1463,20 +1460,20 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
read_mb_interp_filter(cm, xd, mbmi, r);
if (mbmi->motion_mode == WARPED_CAUSAL) {
- mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
- mbmi->wm_params[0].invalid = 0;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->wm_params.invalid = 0;
- if (mbmi->num_proj_ref[0] > 1)
- mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
- mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
- if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
- &mbmi->wm_params[0], mi_row, mi_col)) {
+ &mbmi->wm_params, mi_row, mi_col)) {
#if WARPED_MOTION_DEBUG
printf("Warning: unexpected warped model from aomenc\n");
#endif
- mbmi->wm_params[0].invalid = 1;
+ mbmi->wm_params.invalid = 1;
}
}
@@ -1512,33 +1509,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
read_cdef(cm, r, xd, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- xd->current_qindex +=
- read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
- /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
- xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- const int tmp_lvl =
- xd->delta_lf[lf_id] +
- read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- } else {
- const int tmp_lvl =
- xd->delta_lf_from_base +
- read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf_from_base = xd->delta_lf_from_base =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- }
- }
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
if (!mbmi->skip_mode)
inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
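
Two refactors dominate the decodemv.c hunks: the duplicated delta-q/delta-lf reading in the intra and inter paths is hoisted into read_delta_q_params, and av1_read_tx_type trades one compound condition for a default assignment plus early returns, so each bail-out reason is stated where it is tested. A minimal sketch of that second shape, with hypothetical predicate names:

#include <stdbool.h>

typedef enum { DCT_DCT = 0, OTHER_TX = 1 } TxType;

/* Stand-ins for the real checks (skip flag, lossless qindex, ext-tx set). */
static bool block_is_skipped(void) { return false; }
static bool block_is_lossless(void) { return false; }
static bool ext_tx_available(void) { return true; }
static TxType read_tx_symbol(void) { return OTHER_TX; }

static TxType read_tx_type(void) {
  TxType tx_type = DCT_DCT;       /* the default covers every early return */
  if (block_is_skipped()) return tx_type;
  if (block_is_lossless()) return tx_type;
  if (!ext_tx_available()) return tx_type;
  return read_tx_symbol();        /* only reached when a symbol was coded */
}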
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
index 6243bb168..1625e5bd2 100644
--- a/third_party/aom/av1/decoder/decodemv.h
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODEMV_H_
-#define AV1_DECODER_DECODEMV_H_
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
#include "aom_dsp/bitreader.h"
@@ -32,4 +32,4 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
int blk_col, TX_SIZE tx_size, aom_reader *r);
-#endif // AV1_DECODER_DECODEMV_H_
+#endif // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index e978fad6c..a5f4fd67f 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -37,16 +37,11 @@
#include "av1/decoder/obu.h"
static void initialize_dec(void) {
- static volatile int init_done = 0;
-
- if (!init_done) {
- av1_rtcd();
- aom_dsp_rtcd();
- aom_scale_rtcd();
- av1_init_intra_predictors();
- av1_init_wedge_masks();
- init_done = 1;
- }
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_wedge_masks();
}
static void dec_setup_mi(AV1_COMMON *cm) {
@@ -171,8 +166,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
if (pbi->thread_data) {
for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
- av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+ av1_free_mc_tmp_buf(thread_data->td);
aom_free(thread_data->td);
}
aom_free(pbi->thread_data);
@@ -209,8 +203,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
#if CONFIG_ACCOUNTING
aom_accounting_clear(&pbi->accounting);
#endif
- const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
- av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+ av1_free_mc_tmp_buf(&pbi->td);
aom_free(pbi);
}
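
initialize_dec loses its "volatile int init_done" guard, which was never thread-safe (two racing threads could both observe init_done == 0). Presumably the one-time semantics now live at the call site; a portable sketch of that shape using pthread_once (whether libaom uses pthread_once or its own once-wrapper is not shown in this diff):

#include <pthread.h>

static pthread_once_t dec_once = PTHREAD_ONCE_INIT;

static void initialize_dec_body(void) {
  /* rtcd table setup, intra-predictor and wedge-mask init would go here */
}

/* Safe to call from any number of threads; the body runs exactly once. */
static void initialize_dec(void) {
  pthread_once(&dec_once, initialize_dec_body);
}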
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 610b98d95..5ca939c24 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODER_H_
-#define AV1_DECODER_DECODER_H_
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
#include "config/aom_config.h"
@@ -55,6 +55,11 @@ typedef struct ThreadData {
CB_BUFFER cb_buffer_base;
uint8_t *mc_buf[2];
int32_t mc_buf_size;
+ int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in
+ // mc_buf were converted from highbd pointers.
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
@@ -199,6 +204,7 @@ typedef struct AV1Decoder {
int tg_start; // First tile in the current tilegroup
int tg_size_bit_offset;
int sequence_header_ready;
+ int sequence_header_changed;
#if CONFIG_INSPECTION
aom_inspect_cb inspect_cb;
void *inspect_ctx;
@@ -308,4 +314,4 @@ typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
} // extern "C"
#endif
-#endif // AV1_DECODER_DECODER_H_
+#endif // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index 687bba958..fe04f6abd 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef DECODETXB_H_
-#define DECODETXB_H_
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
#include "config/aom_config.h"
@@ -29,4 +29,4 @@ void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
MACROBLOCKD *const xd, aom_reader *const r,
const int plane, const int row, const int col,
const TX_SIZE tx_size);
-#endif // DECODETXB_H_
+#endif // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
index ec85bf7ea..173b437a9 100644
--- a/third_party/aom/av1/decoder/detokenize.h
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DETOKENIZE_H_
-#define AV1_DECODER_DETOKENIZE_H_
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
#include "config/aom_config.h"
@@ -26,4 +26,4 @@ void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_DECODER_DETOKENIZE_H_
+#endif // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index 9f854e015..1d264b07e 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DTHREAD_H_
-#define AV1_DECODER_DTHREAD_H_
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
#include "config/aom_config.h"
@@ -79,4 +79,4 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
} // extern "C"
#endif
-#endif // AV1_DECODER_DTHREAD_H_
+#endif // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
index bb604f684..7214a9bed 100644
--- a/third_party/aom/av1/decoder/inspection.h
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_INSPECTION_H_
-#define AOM_INSPECTION_H_
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
#ifdef __cplusplus
extern "C" {
@@ -81,4 +81,4 @@ int ifd_inspect(insp_frame_data *fd, void *decoder);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_INSPECTION_H_
+#endif // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
index 715bc6837..44ecf818e 100644
--- a/third_party/aom/av1/decoder/obu.c
+++ b/third_party/aom/av1/decoder/obu.c
@@ -18,6 +18,7 @@
#include "aom_ports/mem_ops.h"
#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
#include "av1/common/timing.h"
#include "av1/decoder/decoder.h"
#include "av1/decoder/decodeframe.h"
@@ -42,85 +43,6 @@ typedef enum {
SCALABILITY_SS = 14
} SCALABILITY_STRUCTURES;
-// Returns 1 when OBU type is valid, and 0 otherwise.
-static int valid_obu_type(int obu_type) {
- int valid_type = 0;
- switch (obu_type) {
- case OBU_SEQUENCE_HEADER:
- case OBU_TEMPORAL_DELIMITER:
- case OBU_FRAME_HEADER:
- case OBU_TILE_GROUP:
- case OBU_METADATA:
- case OBU_FRAME:
- case OBU_REDUNDANT_FRAME_HEADER:
- case OBU_TILE_LIST:
- case OBU_PADDING: valid_type = 1; break;
- default: break;
- }
- return valid_type;
-}
-
-// Parses OBU header and stores values in 'header'.
-static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
- int is_annexb, ObuHeader *header) {
- if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
-
- const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
- if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
-
- header->size = 1;
-
- if (aom_rb_read_bit(rb) != 0) {
- // Forbidden bit. Must not be set.
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
-
- if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
-
- header->has_extension = aom_rb_read_bit(rb);
- header->has_size_field = aom_rb_read_bit(rb);
-
- if (!header->has_size_field && !is_annexb) {
- // section 5 obu streams must have obu_size field set.
- return AOM_CODEC_UNSUP_BITSTREAM;
- }
-
- if (aom_rb_read_bit(rb) != 0) {
- // obu_reserved_1bit must be set to 0.
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- if (header->has_extension) {
- if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
-
- header->size += 1;
- header->temporal_layer_id = aom_rb_read_literal(rb, 3);
- header->spatial_layer_id = aom_rb_read_literal(rb, 2);
- if (aom_rb_read_literal(rb, 3) != 0) {
- // extension_header_reserved_3bits must be set to 0.
- return AOM_CODEC_CORRUPT_FRAME;
- }
- }
-
- return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
- size_t *consumed, ObuHeader *header,
- int is_annexb) {
- if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
-
- // TODO(tomfinegan): Set the error handler here and throughout this file, and
- // confirm parsing work done via aom_read_bit_buffer is successful.
- struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
- NULL };
- aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
- if (parse_result == AOM_CODEC_OK) *consumed = header->size;
- return parse_result;
-}
-
aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
int operating_point_idc, unsigned int *number_spatial_layers,
unsigned int *number_temporal_layers) {
@@ -208,7 +130,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
SequenceHeader *const seq_params = &sh;
seq_params->profile = av1_read_profile(rb);
- if (seq_params->profile > PROFILE_2) {
+ if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
return 0;
}
@@ -349,10 +271,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
// If a sequence header has been decoded before, we check if the new
// one is consistent with the old one.
if (pbi->sequence_header_ready) {
- if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Inconsistent sequence headers received.");
- }
+ if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+ pbi->sequence_header_changed = 1;
}
cm->seq_params = *seq_params;
@@ -620,9 +540,9 @@ static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
static void scalability_structure(struct aom_read_bit_buffer *rb) {
int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
- int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1);
- int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1);
- int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1);
+ int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+ int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+ int temporal_group_description_present_flag = aom_rb_read_bit(rb);
aom_rb_read_literal(rb, 3); // reserved
if (spatial_layer_dimensions_present_flag) {
@@ -643,8 +563,8 @@ static void scalability_structure(struct aom_read_bit_buffer *rb) {
temporal_group_size = aom_rb_read_literal(rb, 8);
for (i = 0; i < temporal_group_size; i++) {
aom_rb_read_literal(rb, 3);
- aom_rb_read_literal(rb, 1);
- aom_rb_read_literal(rb, 1);
+ aom_rb_read_bit(rb);
+ aom_rb_read_bit(rb);
int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
for (j = 0; j < temporal_group_ref_cnt; j++) {
aom_rb_read_literal(rb, 8);
@@ -716,61 +636,6 @@ static size_t read_metadata(const uint8_t *data, size_t sz) {
return sz;
}
-static aom_codec_err_t read_obu_size(const uint8_t *data,
- size_t bytes_available,
- size_t *const obu_size,
- size_t *const length_field_size) {
- uint64_t u_obu_size = 0;
- if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
- 0) {
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
- *obu_size = (size_t)u_obu_size;
- return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
- size_t bytes_available,
- int is_annexb,
- ObuHeader *obu_header,
- size_t *const payload_size,
- size_t *const bytes_read) {
- size_t length_field_size = 0, obu_size = 0;
- aom_codec_err_t status;
-
- if (is_annexb) {
- // Size field comes before the OBU header, and includes the OBU header
- status =
- read_obu_size(data, bytes_available, &obu_size, &length_field_size);
-
- if (status != AOM_CODEC_OK) return status;
- }
-
- struct aom_read_bit_buffer rb = { data + length_field_size,
- data + bytes_available, 0, NULL, NULL };
-
- status = read_obu_header(&rb, is_annexb, obu_header);
- if (status != AOM_CODEC_OK) return status;
-
- if (is_annexb) {
- // Derive the payload size from the data we've already read
- if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
-
- *payload_size = obu_size - obu_header->size;
- } else {
- // Size field comes after the OBU header, and is just the payload size
- status = read_obu_size(data + obu_header->size,
- bytes_available - obu_header->size, payload_size,
- &length_field_size);
- if (status != AOM_CODEC_OK) return status;
- }
-
- *bytes_read = length_field_size + obu_header->size;
- return AOM_CODEC_OK;
-}
-
// On success, returns a boolean that indicates whether the decoding of the
// current frame is finished. On failure, sets cm->error.error_code and
// returns -1.
@@ -781,8 +646,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
int frame_decoding_finished = 0;
int is_first_tg_obu_received = 1;
uint32_t frame_header_size = 0;
- int seq_header_received = 0;
- size_t seq_header_size = 0;
ObuHeader obu_header;
memset(&obu_header, 0, sizeof(obu_header));
pbi->seen_frame_header = 0;
@@ -853,19 +716,8 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
pbi->seen_frame_header = 0;
break;
case OBU_SEQUENCE_HEADER:
- if (!seq_header_received) {
- decoded_payload_size = read_sequence_header_obu(pbi, &rb);
- if (cm->error.error_code != AOM_CODEC_OK) return -1;
-
- seq_header_size = decoded_payload_size;
- seq_header_received = 1;
- } else {
- // Seeing another sequence header, skip as all sequence headers are
- // required to be identical except for the contents of
- // operating_parameters_info and the amount of trailing bits.
- // TODO(yaowu): verifying redundant sequence headers are identical.
- decoded_payload_size = seq_header_size;
- }
+ decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+ if (cm->error.error_code != AOM_CODEC_OK) return -1;
break;
case OBU_FRAME_HEADER:
case OBU_REDUNDANT_FRAME_HEADER:
@@ -889,6 +741,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
assert(rb.bit_offset == 0);
rb.bit_offset = 8 * frame_header_size;
}
+
decoded_payload_size = frame_header_size;
pbi->frame_header_size = frame_header_size;
@@ -938,6 +791,11 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
decoded_payload_size = read_metadata(data, payload_size);
break;
case OBU_TILE_LIST:
+ if (CONFIG_NORMAL_TILE_MODE) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+
// This OBU type is purely for the large scale tile coding mode.
// The common camera frame header has to be already decoded.
if (!pbi->camera_frame_header_ready) {
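
The obu.c changes do two things: OBU header and size parsing moves out to the shared av1/common/obu_util.c, and an inconsistent sequence header is no longer an immediate error. The parser instead sets pbi->sequence_header_changed, and the frame-header code decides whether the change is legal once the frame type is known (see the read_uncompressed_header hunks earlier in this diff). A sketch of that defer-then-decide shape, with simplified types:

typedef enum { KEY_FRAME, INTER_FRAME } FrameType;

typedef struct {
  int sequence_header_changed;  /* set by the OBU parser, nothing more */
  int decoding_first_frame;
} Dec;

/* Returns 0 on success, -1 where the real code raises
 * AOM_CODEC_CORRUPT_FRAME via aom_internal_error(). */
static int on_frame_header(Dec *pbi, FrameType ft, int show_existing) {
  if (pbi->sequence_header_changed) {
    if (show_existing) return -1;    /* a new CVS cannot start here */
    if (ft != KEY_FRAME) return -1;  /* change without a keyframe */
    pbi->sequence_header_changed = 0;  /* start of a new coded video sequence */
    pbi->decoding_first_frame = 1;
    /* reset_frame_buffers(cm) runs at this point in decodeframe.c */
  }
  return 0;
}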
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
index 5f2197058..5ab243fc9 100644
--- a/third_party/aom/av1/decoder/obu.h
+++ b/third_party/aom/av1/decoder/obu.h
@@ -9,35 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_OBU_H
-#define AV1_DECODER_OBU_H
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
#include "aom/aom_codec.h"
#include "av1/decoder/decoder.h"
-typedef struct {
- size_t size; // Size (1 or 2 bytes) of the OBU header (including the
- // optional OBU extension header) in the bitstream.
- OBU_TYPE type;
- int has_size_field;
- int has_extension;
- // The following fields come from the OBU extension header and therefore are
- // only used if has_extension is true.
- int temporal_layer_id;
- int spatial_layer_id;
-} ObuHeader;
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
- size_t *consumed, ObuHeader *header,
- int is_annexb);
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
- size_t bytes_available,
- int is_annexb,
- ObuHeader *obu_header,
- size_t *const payload_size,
- size_t *const bytes_read);
-
// Try to decode one frame from a buffer.
// Returns 1 if we decoded a frame,
// 0 if we didn't decode a frame but that's okay
@@ -51,4 +28,4 @@ aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
int operating_point_idc, unsigned int *num_spatial_layers,
unsigned int *num_temporal_layers);
-#endif
+#endif // AOM_AV1_DECODER_OBU_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index b721b6d2b..80f8e2e66 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -143,9 +143,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
aom_clear_system_state();
- low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
- MIN_DEFAULT_LV_THRESH)
- : DEFAULT_LV_THRESH;
+ low_var_thresh =
+ (cpi->oxcf.pass == 2)
+ ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
logvar = av1_log_block_var(cpi, mb, bs);
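
The aq_complexity.c hunk wraps mb_av_energy in exp() before the clamp. The likely reason, and it is an assumption consistent with the aq_variance.c rewrite later in this diff, is that mb_av_energy is now a log-domain average, so it has to be mapped back before use as a linear threshold:

#include <math.h>

#define MIN_LV_THRESH 2.5  /* placeholder value for this sketch */

/* avg_energy_log is a log-domain average; exp() restores the domain the
 * floor and the later comparison operate in. */
static double low_var_threshold(double avg_energy_log) {
  const double linear = exp(avg_energy_log);
  return linear > MIN_LV_THRESH ? linear : MIN_LV_THRESH;
}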
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
index af525b36d..3421d74c9 100644
--- a/third_party/aom/av1/encoder/aq_complexity.h
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_
-#define AV1_ENCODER_AQ_COMPLEXITY_H_
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
#ifdef __cplusplus
extern "C" {
@@ -34,4 +34,4 @@ void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_COMPLEXITY_H_
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index dec2c730d..f532d48da 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -80,9 +80,11 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
}
void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
- aom_free(cr->map);
- aom_free(cr->last_coded_q_map);
- aom_free(cr);
+ if (cr != NULL) {
+ aom_free(cr->map);
+ aom_free(cr->last_coded_q_map);
+ aom_free(cr);
+ }
}
// Check if we should turn off cyclic refresh based on bitrate condition.
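
The guard added to av1_cyclic_refresh_free is not about free() itself: free(NULL) and aom_free(NULL) are well defined, but cr->map dereferences cr, so destroying a NULL handle (e.g. after a failed allocation) would crash. The NULL-safe destroy pattern in miniature:

#include <stdlib.h>

typedef struct {
  int *map;
  int *last_coded_q_map;
} CyclicRefresh;

static void cyclic_refresh_free(CyclicRefresh *cr) {
  if (cr != NULL) {  /* guards the dereferences, not the frees */
    free(cr->map);   /* free(NULL) alone would have been fine */
    free(cr->last_coded_q_map);
    free(cr);
  }
}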
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
index 459ab80b8..b45781983 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.h
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_
-#define AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
#include "av1/common/blockd.h"
@@ -95,4 +95,4 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 6cb6adc42..58f906bdc 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -14,34 +14,33 @@
#include "aom_ports/mem.h"
#include "av1/encoder/aq_variance.h"
-
#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/dwt.h"
#include "aom_ports/system_state.h"
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+                                                 0.9, 0.8, 0.7, 0.6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
#define ENERGY_MIN (-4)
#define ENERGY_MAX (1)
#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
#define ENERGY_IN_BOUNDS(energy) \
assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
-static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
- 0.75, 1.0, 1.0, 1.0 };
-static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
-
-#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
-
DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
DECLARE_ALIGNED(16, static const uint16_t,
av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
-unsigned int av1_vaq_segment_id(int energy) {
- ENERGY_IN_BOUNDS(energy);
- return SEGMENT_ID(energy);
-}
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
void av1_vaq_frame_setup(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
@@ -51,6 +50,12 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
int resolution_change =
cm->prev_frame && (cm->width != cm->prev_frame->width ||
cm->height != cm->prev_frame->height);
+ int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+ double avg_ratio;
+ if (avg_energy > 7) avg_energy = 7;
+ if (avg_energy < 0) avg_energy = 0;
+ avg_ratio = rate_ratio[avg_energy];
+
if (resolution_change) {
memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
av1_clearall_segfeatures(seg);
@@ -69,9 +74,11 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
aom_clear_system_state();
for (i = 0; i < MAX_SEGMENTS; ++i) {
- int qindex_delta =
- av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- rate_ratio[i], cm->seq_params.bit_depth);
+      // Normalize so that the segment at the average energy gets a rate
+      // ratio of 1.0 and the other segments are adjusted around it.
+ int qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio,
+ cm->seq_params.bit_depth);
// We don't allow qindex 0 in a segment if the base value is not 0.
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -87,114 +94,58 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
}
}
-/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions
- * of variance() and highbd_8_variance(). It should not.
- */
-static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance: the average
+  // of log(1 + variance / 16) over the 4x4 subblocks of the current block
+  // (x, bs).
+  // It is used for segmentation to avoid situations in which a large block
+  // with a gentle gradient gets marked as high variance even though each
+  // subblock has a low variance. This lets us assign the same segment number
+  // to the same sorts of areas regardless of how the partitioning goes.
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint64_t *sse, uint64_t *sum) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ double var = 0;
+ unsigned int sse;
int i, j;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-
-static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bs) {
- MACROBLOCKD *xd = &x->e_mbd;
- unsigned int var, sse;
int right_overflow =
(xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
int bottom_overflow =
(xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
- if (right_overflow || bottom_overflow) {
- const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
- const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
- int avg;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
- &sse, &avg);
- sse >>= 2 * (xd->bd - 8);
- avg >>= (xd->bd - 8);
- } else {
- aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
- bw, bh, &sse, &avg);
- }
- var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
- return (unsigned int)((uint64_t)var * 256) / (bw * bh);
- } else {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- var =
- cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
- } else {
- var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- av1_all_zeros, 0, &sse);
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ aom_clear_system_state();
+
+ for (i = 0; i < bh; i += 4) {
+ for (j = 0; j < bw; j += 4) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+ 16);
+ } else {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+ 16);
+ }
}
- return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
}
-}
+  // Use the average of the 4x4 log variances. For 8-bit input the range is
+  // 0 to 9.704121561.
+ var /= (bw / 4 * bh / 4);
+ if (var > 7) var = 7;
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- unsigned int var = block_variance(cpi, x, bs);
aom_clear_system_state();
- return log(var + 1.0);
+ return (int)(var);
}
#define DEFAULT_E_MIDPOINT 10.0
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- double energy;
- double energy_midpoint;
- aom_clear_system_state();
- energy_midpoint =
- (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
- energy = av1_log_block_var(cpi, x, bs) - energy_midpoint;
- return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
-}
unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
MACROBLOCKD *xd = &x->e_mbd;
@@ -231,17 +182,21 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
int block_var_level) {
- ENERGY_IN_BOUNDS(block_var_level);
-
- const int rate_level = SEGMENT_ID(block_var_level);
+ int rate_level;
const AV1_COMMON *const cm = &cpi->common;
+
+ if (DELTAQ_MODULATION == 1) {
+ ENERGY_IN_BOUNDS(block_var_level);
+ rate_level = SEGMENT_ID(block_var_level);
+ } else {
+ rate_level = block_var_level;
+ }
int qindex_delta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level],
+ &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level],
cm->seq_params.bit_depth);
if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
qindex_delta = -cm->base_qindex + 1;
}
-
return qindex_delta;
}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
index b1a8bc38a..2d22b663e 100644
--- a/third_party/aom/av1/encoder/aq_variance.h
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_VARIANCE_H_
-#define AV1_ENCODER_AQ_VARIANCE_H_
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
#include "av1/encoder/encoder.h"
@@ -18,11 +18,9 @@
extern "C" {
#endif
-unsigned int av1_vaq_segment_id(int energy);
void av1_vaq_frame_setup(AV1_COMP *cpi);
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
int block_var_level);
int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -32,4 +30,4 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_VARIANCE_H_
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
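
The aq_variance.c rewrite replaces the whole-block variance with the per-4x4 log-variance average described in the av1_log_block_var comment above, and splits the rate-ratio tables so the delta-q path keeps its own values (deltaq_rate_ratio) while segment-based VAQ normalizes rate_ratio by the ratio at the frame's average energy. A self-contained toy model of the new score, with a scalar 4x4 variance standing in for cpi->fn_ptr[BLOCK_4X4].vf:

#include <math.h>
#include <stdint.h>

/* Variance of a 4x4 block against an all-zero reference:
 * sse - sum^2 / 16, matching what vf() returns for a zero reference. */
static double var4x4(const uint8_t *p, int stride) {
  int sum = 0, sse = 0;
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      const int v = p[i * stride + j];
      sum += v;
      sse += v * v;
    }
  }
  return sse - (double)sum * sum / 16.0;
}

/* Average of log(1 + var/16) over the 4x4 subblocks, clamped to [0, 7].
 * w and h are assumed to be multiples of 4, as in the padded encoder. */
static int log_block_var(const uint8_t *src, int stride, int w, int h) {
  double acc = 0.0;
  for (int i = 0; i < h; i += 4)
    for (int j = 0; j < w; j += 4)
      acc += log(1.0 + var4x4(src + i * stride + j, stride) / 16.0);
  acc /= (w / 4) * (h / 4);
  return acc > 7.0 ? 7 : (int)acc;
}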
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
index b92b3469f..98505e0b1 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -11,24 +11,7 @@
#include <stdlib.h>
#include "av1/encoder/av1_fwd_txfm1d.h"
-
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
- int32_t size, int8_t bit);
-
-#define range_check(stage, input, buf, size, bit) \
- range_check_func(stage, input, buf, size, bit)
-#else // CONFIG_COEFFICIENT_RANGE_CHECKING
-
-#define range_check(stage, input, buf, size, bit) \
- { \
- (void)stage; \
- (void)input; \
- (void)buf; \
- (void)size; \
- (void)bit; \
- }
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "av1/common/av1_txfm.h"
void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
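
The rest of this file swaps the locally defined range_check macro for av1_range_check_buf, shared through av1/common/av1_txfm.h and, like the old macro, expected to compile to nothing unless CONFIG_COEFFICIENT_RANGE_CHECKING is enabled. A sketch of such a config-guarded checker, modeled on the call sites below (the real definition lives in libaom, not here):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#ifndef CONFIG_COEFFICIENT_RANGE_CHECKING
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#endif

/* Verify that every intermediate coefficient fits the per-stage bit budget. */
static void range_check_buf(int32_t stage, const int32_t *input,
                            const int32_t *buf, int32_t size, int8_t bit) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
  const int64_t max_value = (1LL << (bit - 1)) - 1;
  const int64_t min_value = -(1LL << (bit - 1));
  for (int i = 0; i < size; ++i) {
    if (buf[i] < min_value || buf[i] > max_value) {
      fprintf(stderr, "stage %d: buf[%d] = %d is outside [%lld, %lld]\n",
              (int)stage, i, (int)buf[i], (long long)min_value,
              (long long)max_value);
      assert(0);
    }
  }
  (void)input;
#else
  (void)stage;
  (void)input;
  (void)buf;
  (void)size;
  (void)bit;
#endif
}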
@@ -40,7 +23,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[4];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -49,7 +32,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = input[1] + input[2];
bf1[2] = -input[2] + input[1];
bf1[3] = -input[3] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -60,7 +43,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -70,7 +53,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = bf0[2];
bf1[2] = bf0[1];
bf1[3] = bf0[3];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -83,7 +66,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[8];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -96,7 +79,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = -input[5] + input[2];
bf1[6] = -input[6] + input[1];
bf1[7] = -input[7] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -111,7 +94,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
bf1[7] = bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -126,7 +109,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = -bf0[5] + bf0[4];
bf1[6] = -bf0[6] + bf0[7];
bf1[7] = bf0[7] + bf0[6];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -141,7 +124,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -155,7 +138,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = bf0[3];
bf1[7] = bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -168,7 +151,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[16];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -189,7 +172,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -input[13] + input[2];
bf1[14] = -input[14] + input[1];
bf1[15] = -input[15] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -212,7 +195,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -235,7 +218,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -bf0[13] + bf0[14];
bf1[14] = bf0[14] + bf0[13];
bf1[15] = bf0[15] + bf0[12];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -258,7 +241,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -281,7 +264,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -bf0[13] + bf0[12];
bf1[14] = -bf0[14] + bf0[15];
bf1[15] = bf0[15] + bf0[14];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -304,7 +287,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -326,7 +309,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[11];
bf1[14] = bf0[7];
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -339,7 +322,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[32];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -376,7 +359,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -input[29] + input[2];
bf1[30] = -input[30] + input[1];
bf1[31] = -input[31] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -415,7 +398,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29];
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -454,7 +437,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29] + bf0[26];
bf1[30] = bf0[30] + bf0[25];
bf1[31] = bf0[31] + bf0[24];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -493,7 +476,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -532,7 +515,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -bf0[29] + bf0[30];
bf1[30] = bf0[30] + bf0[29];
bf1[31] = bf0[31] + bf0[28];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -571,7 +554,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -610,7 +593,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -bf0[29] + bf0[28];
bf1[30] = -bf0[30] + bf0[31];
bf1[31] = bf0[31] + bf0[30];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -649,7 +632,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -687,7 +670,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[23];
bf1[30] = bf0[15];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -698,7 +681,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t s0, s1, s2, s3, s4, s5, s6, s7;
// stage 0
- range_check(0, input, input, 4, stage_range[0]);
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
x0 = input[0];
x1 = input[1];
x2 = input[2];
@@ -746,7 +729,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
output[1] = round_shift(s1, bit);
output[2] = round_shift(s2, bit);
output[3] = round_shift(s3, bit);
- range_check(6, input, output, 4, stage_range[6]);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
}
void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -759,7 +742,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[8];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -773,7 +756,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[6];
bf1[6] = input[2];
bf1[7] = -input[5];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -788,7 +771,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -802,7 +785,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5] + bf0[7];
bf1[6] = bf0[4] - bf0[6];
bf1[7] = bf0[5] - bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -817,7 +800,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -831,7 +814,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[1] - bf0[5];
bf1[6] = bf0[2] - bf0[6];
bf1[7] = bf0[3] - bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -846,7 +829,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -860,7 +843,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[2];
bf1[6] = bf0[7];
bf1[7] = bf0[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -873,7 +856,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[16];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -895,7 +878,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -input[13];
bf1[14] = -input[5];
bf1[15] = input[10];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -918,7 +901,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13];
bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -940,7 +923,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13] + bf0[15];
bf1[14] = bf0[12] - bf0[14];
bf1[15] = bf0[13] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -963,7 +946,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -985,7 +968,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[9] - bf0[13];
bf1[14] = bf0[10] - bf0[14];
bf1[15] = bf0[11] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1008,7 +991,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1030,7 +1013,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[5] - bf0[13];
bf1[14] = bf0[6] - bf0[14];
bf1[15] = bf0[7] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1053,7 +1036,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1075,7 +1058,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[2];
bf1[14] = bf0[15];
bf1[15] = bf0[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1084,14 +1067,14 @@ void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
for (int i = 0; i < 4; ++i)
output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
assert(stage_range[0] + NewSqrt2Bits <= 32);
- range_check(0, input, output, 4, stage_range[0]);
+ av1_range_check_buf(0, input, output, 4, stage_range[0]);
}
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
(void)cos_bit;
for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
- range_check(0, input, output, 8, stage_range[0]);
+ av1_range_check_buf(0, input, output, 8, stage_range[0]);
}
void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1100,14 +1083,14 @@ void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
for (int i = 0; i < 16; ++i)
output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
assert(stage_range[0] + NewSqrt2Bits <= 32);
- range_check(0, input, output, 16, stage_range[0]);
+ av1_range_check_buf(0, input, output, 16, stage_range[0]);
}
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
(void)cos_bit;
for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
- range_check(0, input, output, 32, stage_range[0]);
+ av1_range_check_buf(0, input, output, 32, stage_range[0]);
}
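Note (reader's sketch, not part of the patch): the four identity transforms above scale by sqrt(N/2) — sqrt(2), 2, 2*sqrt(2), and 4 for N = 4, 8, 16, 32 — so sizes 8 and 32 are plain integer doublings while 4 and 16 need the NewSqrt2 fixed-point step. A standalone check, assuming NewSqrt2 = 5793 and NewSqrt2Bits = 12 (the Q12 constants from av1/common/av1_txfm.h):

    #include <assert.h>
    #include <stdint.h>

    static const int32_t kNewSqrt2Bits = 12;
    static const int32_t kNewSqrt2 = 5793; /* round(sqrt(2) * 2^12) */

    /* Round-to-nearest right shift, as used by the transforms above. */
    static int64_t round_shift(int64_t value, int bit) {
      return (value + ((int64_t)1 << (bit - 1))) >> bit;
    }

    int main(void) {
      /* fidentity4: x * sqrt(2) in Q12. */
      assert(round_shift(1000 * (int64_t)kNewSqrt2, kNewSqrt2Bits) == 1414);
      /* fidentity16: x * 2 * sqrt(2) in Q12. */
      assert(round_shift(1000 * 2 * (int64_t)kNewSqrt2, kNewSqrt2Bits) == 2829);
      return 0;
    }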
void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1120,7 +1103,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[64];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -1189,7 +1172,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -input[61] + input[2];
bf1[62] = -input[62] + input[1];
bf1[63] = -input[63] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -1260,7 +1243,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -1331,7 +1314,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61] + bf0[50];
bf1[62] = bf0[62] + bf0[49];
bf1[63] = bf0[63] + bf0[48];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -1402,7 +1385,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -1473,7 +1456,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61] + bf0[58];
bf1[62] = bf0[62] + bf0[57];
bf1[63] = bf0[63] + bf0[56];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1544,7 +1527,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1615,7 +1598,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -bf0[61] + bf0[62];
bf1[62] = bf0[62] + bf0[61];
bf1[63] = bf0[63] + bf0[60];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1686,7 +1669,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1757,7 +1740,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -bf0[61] + bf0[60];
bf1[62] = -bf0[62] + bf0[63];
bf1[63] = bf0[63] + bf0[62];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 10
stage++;
@@ -1828,7 +1811,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 11
stage++;
@@ -1898,5 +1881,5 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[47];
bf1[62] = bf0[31];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
index 9472af8e6..9dcf16552 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM1D_H_
-#define AV1_FWD_TXFM1D_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
#include "av1/common/av1_txfm.h"
@@ -46,4 +46,4 @@ void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
}
#endif
-#endif // AV1_FWD_TXFM1D_H_
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
index 174689a14..98b6530db 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -9,11 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM2D_CFG_H_
-#define AV1_FWD_TXFM2D_CFG_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
#include "av1/common/enums.h"
#include "av1/encoder/av1_fwd_txfm1d.h"
extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
extern const int8_t fwd_cos_bit_col[5][5];
extern const int8_t fwd_cos_bit_row[5][5];
-#endif // AV1_FWD_TXFM2D_CFG_H_
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index d0477b35b..a0a926005 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -273,35 +273,32 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
- // obsolete skip_block
- const int skip_block = 0;
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
if (qm_ptr != NULL && iqm_ptr != NULL) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan, qm_ptr, iqm_ptr,
- qparam->log_scale);
+ quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
} else {
switch (qparam->log_scale) {
case 0:
- aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
case 1:
- aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
case 2:
- aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
default: assert(0);
}
@@ -392,28 +389,25 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
const SCAN_ORDER *sc,
const QUANT_PARAM *qparam) {
- // obsolete skip_block
- const int skip_block = 0;
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
if (qm_ptr != NULL && iqm_ptr != NULL) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan, qm_ptr, iqm_ptr,
- qparam->log_scale);
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
} else {
switch (qparam->log_scale) {
case 0:
if (LIKELY(n_coeffs >= 8)) {
- aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
- eob_ptr, sc->scan, sc->iscan);
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
} else {
// TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
// quantization
- aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_QTX, p->quant_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
@@ -421,15 +415,15 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
break;
case 1:
aom_highbd_quantize_b_32x32(
- coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
break;
case 2:
aom_highbd_quantize_b_64x64(
- coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
break;
default: assert(0);
}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
index eaf8374de..35af9a67a 100644
--- a/third_party/aom/av1/encoder/av1_quantize.h
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_QUANTIZE_H_
-#define AV1_ENCODER_QUANTIZE_H_
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
#include "config/aom_config.h"
@@ -145,4 +145,4 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
} // extern "C"
#endif
-#endif // AV1_ENCODER_QUANTIZE_H_
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index 2070755cd..2c4acdb02 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -18,6 +18,7 @@
#include "aom_dsp/binary_codes_writer.h"
#include "aom_dsp/bitwriter_buffer.h"
#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
#include "aom_ports/mem_ops.h"
#include "aom_ports/system_state.h"
#if CONFIG_BITSTREAM_DEBUG
@@ -30,7 +31,6 @@
#include "av1/common/entropymode.h"
#include "av1/common/entropymv.h"
#include "av1/common/mvref_common.h"
-#include "av1/common/odintrin.h"
#include "av1/common/pred_common.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
@@ -66,11 +66,11 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
aom_writer *const w, int plane,
FRAME_COUNTS *counts);
-static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx,
- const MB_MODE_INFO *mi,
- const MB_MODE_INFO *above_mi,
- const MB_MODE_INFO *left_mi,
- PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode, aom_writer *w) {
assert(!is_intrabc_block(mi));
(void)mi;
aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
@@ -297,7 +297,7 @@ static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
DELTA_Q_PROBS + 1);
if (!smallval) {
- rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ rem_bits = get_msb(abs - 1);
thr = (1 << rem_bits) + 1;
aom_write_literal(w, rem_bits - 1, 3);
aom_write_literal(w, abs - thr, rem_bits);
@@ -326,7 +326,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
if (!smallval) {
- rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ rem_bits = get_msb(abs - 1);
thr = (1 << rem_bits) + 1;
aom_write_literal(w, rem_bits - 1, 3);
aom_write_literal(w, abs - thr, rem_bits);
@@ -836,8 +836,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
}
}
-static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
- PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+ PREDICTION_MODE mode, aom_writer *w) {
aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
INTRA_MODES);
}
@@ -933,45 +933,24 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
}
}
-static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
- const int mi_col, aom_writer *w) {
+// If delta q is present, writes the delta_q index. Also writes the delta
+// loop filter levels, if present.
+static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int skip, aom_writer *w) {
AV1_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->td.mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
- const struct segmentation *const seg = &cm->seg;
- struct segmentation_probs *const segp = &ec_ctx->seg;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- const PREDICTION_MODE mode = mbmi->mode;
- const int segment_id = mbmi->segment_id;
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const int allow_hp = cm->allow_high_precision_mv;
- const int is_inter = is_inter_block(mbmi);
- const int is_compound = has_second_ref(mbmi);
- int skip, ref;
- (void)mi_row;
- (void)mi_col;
-
- write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
-
- write_skip_mode(cm, xd, segment_id, mbmi, w);
-
- assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
- skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
-
- write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
-
- write_cdef(cm, xd, w, skip, mi_col, mi_row);
-
if (cm->delta_q_present_flag) {
- int super_block_upper_left =
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int super_block_upper_left =
((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
if ((bsize != cm->seq_params.sb_size || skip == 0) &&
super_block_upper_left) {
assert(mbmi->current_qindex > 0);
- int reduced_delta_qindex =
+ const int reduced_delta_qindex =
(mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
write_delta_qindex(xd, reduced_delta_qindex, w);
xd->current_qindex = mbmi->current_qindex;
@@ -996,37 +975,96 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
}
}
}
+}
- if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int is_keyframe,
+ aom_writer *w) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
- if (mbmi->skip_mode) return;
+ // Y mode.
+ if (is_keyframe) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+ } else {
+ write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+ }
- if (!is_inter) {
- write_intra_mode(ec_ctx, bsize, mode, w);
- const int use_angle_delta = av1_use_angle_delta(bsize);
+ // Y angle delta.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ if (use_angle_delta && av1_is_directional_mode(mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+ ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ }
- if (use_angle_delta && av1_is_directional_mode(mode)) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
- ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ // UV mode and UV angle delta.
+ if (!cm->seq_params.monochrome &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+ if (uv_mode == UV_CFL_PRED)
+ write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+ if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+ ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
}
+ }
- if (!cm->seq_params.monochrome &&
- is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y)) {
- const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
- write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
- if (uv_mode == UV_CFL_PRED)
- write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
- if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
- ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
- }
- }
+ // Palette.
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+ write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+ }
- if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
- write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+ // Filter intra.
+ write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
- write_filter_intra_mode_info(cm, xd, mbmi, w);
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+
+ write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
} else {
int16_t mode_ctx;
@@ -1172,11 +1210,7 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
const struct segmentation *const seg = &cm->seg;
struct segmentation_probs *const segp = &ec_ctx->seg;
- const MB_MODE_INFO *const above_mi = xd->above_mbmi;
- const MB_MODE_INFO *const left_mi = xd->left_mbmi;
const MB_MODE_INFO *const mbmi = xd->mi[0];
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const PREDICTION_MODE mode = mbmi->mode;
if (seg->segid_preskip && seg->update_map)
write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
@@ -1188,69 +1222,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
write_cdef(cm, xd, w, skip, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- int super_block_upper_left =
- ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
- ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
- if ((bsize != cm->seq_params.sb_size || skip == 0) &&
- super_block_upper_left) {
- assert(mbmi->current_qindex > 0);
- int reduced_delta_qindex =
- (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
- write_delta_qindex(xd, reduced_delta_qindex, w);
- xd->current_qindex = mbmi->current_qindex;
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- int reduced_delta_lflevel =
- (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
- cm->delta_lf_res;
- write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
- xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
- }
- } else {
- int reduced_delta_lflevel =
- (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
- cm->delta_lf_res;
- write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
- xd->delta_lf_from_base = mbmi->delta_lf_from_base;
- }
- }
- }
- }
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
if (av1_allow_intrabc(cm)) {
write_intrabc_info(xd, mbmi_ext, w);
if (is_intrabc_block(mbmi)) return;
}
- write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
-
- const int use_angle_delta = av1_use_angle_delta(bsize);
- if (use_angle_delta && av1_is_directional_mode(mode)) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
- ec_ctx->angle_delta_cdf[mode - V_PRED]);
- }
-
- if (!cm->seq_params.monochrome &&
- is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y)) {
- const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
- write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
- if (uv_mode == UV_CFL_PRED)
- write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
- if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
- ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
- }
- }
-
- if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
- write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
-
- write_filter_intra_mode_info(cm, xd, mbmi, w);
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
}
#if CONFIG_RD_DEBUG
@@ -1549,10 +1528,10 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
} else {
write_selected_tx_size(xd, w);
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd);
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
}
} else {
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
skip && is_inter_block(mbmi), xd);
}
@@ -1694,15 +1673,14 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
}
static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
- aom_writer *const w, const TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end) {
+ aom_writer *const w, int tile_row, int tile_col) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
const int mi_row_start = tile->mi_row_start;
const int mi_row_end = tile->mi_row_end;
const int mi_col_start = tile->mi_col_start;
const int mi_col_end = tile->mi_col_end;
- int mi_row, mi_col;
+ int mi_row, mi_col, sb_row_in_tile;
av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
av1_init_above_context(cm, xd, tile->tile_row);
@@ -1716,13 +1694,21 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
for (mi_row = mi_row_start; mi_row < mi_row_end;
mi_row += cm->seq_params.mib_size) {
+ sb_row_in_tile =
+ (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+ const TOKENEXTRA *tok =
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
+ const TOKENEXTRA *tok_end =
+ tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+
av1_zero_left_context(xd);
for (mi_col = mi_col_start; mi_col < mi_col_end;
mi_col += cm->seq_params.mib_size) {
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+ write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
cm->seq_params.sb_size);
}
+ assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
}
}
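Note (reader's sketch, not part of the patch): the rewrite above replaces one token run per tile with one run per superblock row, read out of cpi->tplist; each row's slice is [start, start + count) and must land exactly on stop, which the assert checks. A sketch of the list entry implied by those accesses (the real type lives elsewhere in the encoder; the field names are taken from the usages above):

    /* Sketch only: one entry per superblock row of a tile. */
    typedef struct {
      TOKENEXTRA *start;  /* first token written for this SB row */
      TOKENEXTRA *stop;   /* one past the last token written */
      unsigned int count; /* == stop - start */
    } TOKENLIST_SKETCH;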
@@ -2220,33 +2206,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm,
}
}
-#if USE_GF16_MULTI_LAYER
-static int get_refresh_mask_gf16(AV1_COMP *cpi) {
- if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
- return 0xFF;
-
- int refresh_mask = 0;
-
- if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
- cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_alt_ref_frame) {
- assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
- refresh_mask |= (1 << cpi->refresh_fb_idx);
- }
-
- return refresh_mask;
-}
-#endif // USE_GF16_MULTI_LAYER
-
static int get_refresh_mask(AV1_COMP *cpi) {
if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
frame_is_sframe(&cpi->common))
return 0xFF;
int refresh_mask = 0;
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi);
-#endif // USE_GF16_MULTI_LAYER
// NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
// notified to get LAST3_FRAME refreshed and then the virtual indexes for all
@@ -2281,8 +2246,13 @@ static int get_refresh_mask(AV1_COMP *cpi) {
// Note: This is highly specific to the use of ARF as a forward reference,
// and this needs to be generalized as other uses are implemented
// (like RTC/temporal scalability).
- return refresh_mask |
- (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+
+ if (cpi->preserve_arf_as_gld) {
+ return refresh_mask;
+ } else {
+ return refresh_mask |
+ (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+ }
} else {
const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
return refresh_mask |
@@ -2574,9 +2544,9 @@ static void write_film_grain_params(AV1_COMP *cpi,
aom_wb_write_literal(wb, pars->random_seed, 16);
- pars->random_seed += 3245; // For film grain test vectors purposes
+ pars->random_seed += 3381; // Changing random seed for film grain
if (!pars->random_seed) // Random seed should not be zero
- pars->random_seed += 1735;
+ pars->random_seed += 7391;
if (cm->frame_type == INTER_FRAME)
aom_wb_write_bit(wb, pars->update_parameters);
else
@@ -2685,7 +2655,8 @@ static void write_sb_size(SequenceHeader *seq_params,
aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
}
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
+static void write_sequence_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
AV1_COMMON *const cm = &cpi->common;
SequenceHeader *seq_params = &cm->seq_params;
@@ -2695,8 +2666,10 @@ void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
int max_frame_height = cpi->oxcf.forced_max_frame_height
? cpi->oxcf.forced_max_frame_height
: cpi->oxcf.height;
+ // max((int)ceil(log2(max_frame_width)), 1)
const int num_bits_width =
(max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+ // max((int)ceil(log2(max_frame_height)), 1)
const int num_bits_height =
(max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
assert(num_bits_width <= 16);
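Note (reader's sketch, not part of the patch): the two new comments check out — with get_msb(n) = floor(log2(n)), the expression (w > 1) ? get_msb(w - 1) + 1 : 1 equals max((int)ceil(log2(w)), 1). A self-contained version, where msb() is a stand-in for aom's get_msb:

    #include <assert.h>

    static int msb(unsigned int n) { /* floor(log2(n)) for n > 0 */
      int log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    static int num_bits(int max_dim) {
      return (max_dim > 1) ? msb((unsigned int)(max_dim - 1)) + 1 : 1;
    }

    int main(void) {
      assert(num_bits(1) == 1);     /* max(ceil(log2(1)), 1) = 1 */
      assert(num_bits(2) == 1);     /* ceil(log2(2)) = 1 */
      assert(num_bits(1920) == 11); /* 2^10 < 1920 <= 2^11 */
      assert(num_bits(2048) == 11); /* exact power of two */
      return 0;
    }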
@@ -2954,7 +2927,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
assert(cm->frame_type == KEY_FRAME);
}
if (!seq_params->reduced_still_picture_hdr) {
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
@@ -3254,14 +3227,14 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (cm->base_qindex > 0) {
aom_wb_write_bit(wb, cm->delta_q_present_flag);
if (cm->delta_q_present_flag) {
- aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+ aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2);
xd->current_qindex = cm->base_qindex;
if (cm->allow_intrabc)
assert(cm->delta_lf_present_flag == 0);
else
aom_wb_write_bit(wb, cm->delta_lf_present_flag);
if (cm->delta_lf_present_flag) {
- aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+ aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2);
aom_wb_write_bit(wb, cm->delta_lf_multi);
av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
}
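Note (reader's worked example, not part of the patch): delta_q_res is a power of two in [1, 8], so get_msb(delta_q_res) fits the 2-bit literal — e.g. res = 4 writes get_msb(4) = 2. A block moving qindex from 60 to 68 then signals (68 - 60)/4 = 2 via write_delta_qindex(), and the decoder reconstructs 60 + 2 * 4 = 68. The replaced OD_ILOG_NZ(x) - 1 spelling computed the same floor(log2(x)) for nonzero x, so the substitution is bit-exact.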
@@ -3508,7 +3481,7 @@ static void write_bitstream_level(BitstreamLevel bl,
aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
}
-static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
AV1_COMMON *const cm = &cpi->common;
struct aom_write_bit_buffer wb = { dst, 0 };
uint32_t size = 0;
@@ -3619,7 +3592,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
AV1_COMMON *const cm = &cpi->common;
aom_writer mode_bc;
int tile_row, tile_col;
- TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
uint32_t total_size = 0;
const int tile_cols = cm->tile_cols;
@@ -3684,8 +3656,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
const int data_offset = have_tiles ? 4 : 0;
const int tile_idx = tile_row * tile_cols + tile_col;
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
@@ -3703,8 +3673,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
mode_bc.allow_update_cdf =
mode_bc.allow_update_cdf && !cm->disable_cdf_update;
aom_start_encode(&mode_bc, buf->data + data_offset);
- write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
- assert(tok == tok_end);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
aom_stop_encode(&mode_bc);
tile_size = mode_bc.pos;
buf->size = tile_size;
@@ -3794,8 +3763,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
const int tile_idx = tile_row * tile_cols + tile_col;
TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
- const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
int is_last_tile_in_tg = 0;
if (new_tg) {
@@ -3847,7 +3814,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
aom_start_encode(&mode_bc, dst + total_size);
- write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
aom_stop_encode(&mode_bc);
tile_size = mode_bc.pos;
assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
@@ -3990,7 +3957,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
data += obu_header_size + obu_payload_size + length_field_size;
}
- const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame);
+ const int write_frame_header =
+ (cm->num_tg > 1 || encode_show_existing_frame(cm));
struct aom_write_bit_buffer saved_wb;
if (write_frame_header) {
// Write Frame Header OBU.
@@ -4017,7 +3985,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
saved_wb.bit_buffer += length_field_size;
}
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
data_size = 0;
} else {
// Each tile group obu will be preceded by 4-byte size of the tile group
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
index 2047b6833..465ccaed5 100644
--- a/third_party/aom/av1/encoder/bitstream.h
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_BITSTREAM_H_
-#define AV1_ENCODER_BITSTREAM_H_
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
#ifdef __cplusplus
extern "C" {
@@ -20,8 +20,13 @@ extern "C" {
struct aom_write_bit_buffer;
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb);
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
uint8_t *const dst);
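Note (reader's sketch, not part of the patch): read together, the two declarations give the packing order — header byte(s) first, then the payload at the returned offset. A hedged sketch of a caller, where the buffer size and the surrounding cpi are assumptions and the OBU size field is written elsewhere in av1_pack_bitstream:

    /* Sketch only: pack a sequence header OBU without its size field. */
    uint8_t buf[1024];
    uint32_t off =
        write_obu_header(OBU_SEQUENCE_HEADER, /*obu_extension=*/0, buf);
    off += write_sequence_header_obu(cpi, buf + off);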
@@ -32,8 +37,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
// Do not swap gf and arf indices for internal overlay frames
- return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
- !cpi->rc.is_src_frame_ext_arf;
+ return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
}
void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
@@ -44,4 +48,4 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
} // extern "C"
#endif
-#endif // AV1_ENCODER_BITSTREAM_H_
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 003e59e39..0bc5dea82 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_BLOCK_H_
-#define AV1_ENCODER_BLOCK_H_
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
#include "av1/common/entropymv.h"
#include "av1/common/entropy.h"
@@ -170,6 +170,7 @@ typedef struct {
InterpFilters filters;
int_mv mv[2];
int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
} INTERPOLATION_FILTER_STATS;
typedef struct macroblock MACROBLOCK;
@@ -254,6 +255,19 @@ struct macroblock {
PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+  // Buffers for hash value calculation of a block, used only in
+  // av1_get_block_hash_value(); indexed as
+  // [first hash/second hash][two buffers used ping-pong]
+ uint32_t *hash_value_buffer[2][2];
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
MvLimits mv_limits;
@@ -344,7 +358,6 @@ struct macroblock {
#if CONFIG_DIST_8X8
int using_dist_8x8;
aom_tune_metric tune_metric;
- DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]);
#endif // CONFIG_DIST_8X8
int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
@@ -352,6 +365,8 @@ struct macroblock {
int tx_search_prune[EXT_TX_SET_TYPES];
int must_find_valid_partition;
int tx_split_prune_flag; // Flag to skip tx split RD search.
+ int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during
+ // interpolation filter search
};
static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
@@ -400,8 +415,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
return depth;
}
+static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+ int skip) {
+ if (skip)
+ x->blk_skip[blk_idx] |= 1UL << plane;
+ else
+ x->blk_skip[blk_idx] &= ~(1UL << plane);
+#ifndef NDEBUG
+  // When the luma flag is set, mark the chroma planes as uninitialized so
+  // that a later read can verify they were set as well
+ if (plane == 0) {
+ x->blk_skip[blk_idx] |= 1UL << (1 + 4);
+ x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+ }
+
+ // Clear the initialization checking bit
+ x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
+static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+#ifndef NDEBUG
+ // Check if this is initialized
+ assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+
+  // Only bits in the 0x77 mask are valid (plane bits 0..2 and their
+  // init-check bits 4..6); anything in the 0x88 mask is garbage data
+ assert((x->blk_skip[blk_idx] & 0x88) == 0);
+#endif
+ return (x->blk_skip[blk_idx] >> plane) & 1;
+}
+
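Note (reader's sketch, not part of the patch): the layout these helpers assume, as read from the code above — bits 0..2 carry the per-plane skip flags; in debug builds bits 4..6 mark planes whose flag has not been set yet; bits 3 and 7 (the 0x88 mask) must always stay clear. A minimal standalone model:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      uint8_t blk = 0;
      blk |= 1u << 0;       /* set_blk_skip(plane = 0, skip = 1) */
      blk |= 1u << (1 + 4); /* chroma planes flagged as uninitialized */
      blk |= 1u << (2 + 4);
      blk &= (uint8_t)~(1u << (0 + 4)); /* luma's init-check bit cleared */
      assert((blk & 0x88) == 0);        /* no garbage in the unused bits */
      assert((blk >> 0) & 1);           /* is_blk_skip(plane = 0) */
      return 0;
    }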
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_BLOCK_H_
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
index 66dedd9ed..f7cff9e53 100644
--- a/third_party/aom/av1/encoder/blockiness.c
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -16,7 +16,6 @@
#include "av1/common/common.h"
#include "av1/common/filter.h"
#include "aom/aom_integer.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
index d6e556b93..57f59f304 100644
--- a/third_party/aom/av1/encoder/context_tree.c
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -175,14 +175,15 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
}
void av1_free_pc_tree(ThreadData *td, const int num_planes) {
- const int tree_nodes_inc = 1024;
-
- const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
- int i;
- for (i = 0; i < tree_nodes; ++i)
- free_tree_contexts(&td->pc_tree[i], num_planes);
- aom_free(td->pc_tree);
- td->pc_tree = NULL;
+ if (td->pc_tree != NULL) {
+ const int tree_nodes_inc = 1024;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ for (int i = 0; i < tree_nodes; ++i) {
+ free_tree_contexts(&td->pc_tree[i], num_planes);
+ }
+ aom_free(td->pc_tree);
+ td->pc_tree = NULL;
+ }
}
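Note (reader's worked equation, not part of the patch): the node count guarded here is the geometric series over the six square-partition depths of a 128x128 superblock, 1024 + 256 + 64 + 16 + 4 + 1 = sum_{k=0}^{5} 4^k = (4^6 - 1)/3 = 1365 — one PC_TREE node per possible square block from 128x128 down to 4x4.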
void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
index c05f48a7a..4efc34985 100644
--- a/third_party/aom/av1/encoder/context_tree.h
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CONTEXT_TREE_H_
-#define AV1_ENCODER_CONTEXT_TREE_H_
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
#include "av1/common/blockd.h"
#include "av1/encoder/block.h"
@@ -56,6 +56,8 @@ typedef struct {
int hybrid_pred_diff;
int comp_pred_diff;
int single_pred_diff;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ int skip_ref_frame_mask;
// TODO(jingning) Use RD_COST struct here instead. This involves a broader
// scope of refactoring.
@@ -109,4 +111,4 @@ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
} // extern "C"
#endif
-#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
index 0317db5b3..cab59a774 100644
--- a/third_party/aom/av1/encoder/corner_detect.h
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CORNER_DETECT_H_
-#define AV1_ENCODER_CORNER_DETECT_H_
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
#include <stdio.h>
#include <stdlib.h>
@@ -19,4 +19,4 @@
int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
int *points, int max_points);
-#endif // AV1_ENCODER_CORNER_DETECT_H_
+#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
index 3b16f9efc..535d2faed 100644
--- a/third_party/aom/av1/encoder/corner_match.h
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CORNER_MATCH_H_
-#define AV1_ENCODER_CORNER_MATCH_H_
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
#include <stdio.h>
#include <stdlib.h>
@@ -30,4 +30,4 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
int height, int frm_stride, int ref_stride,
int *correspondence_pts);
-#endif // AV1_ENCODER_CORNER_MATCH_H_
+#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
index 5de7765c5..af5b09837 100644
--- a/third_party/aom/av1/encoder/cost.h
+++ b/third_party/aom/av1/encoder/cost.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_COST_H_
-#define AV1_ENCODER_COST_H_
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
#include "aom_dsp/prob.h"
#include "aom/aom_integer.h"
@@ -44,4 +44,4 @@ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
} // extern "C"
#endif
-#endif // AV1_ENCODER_COST_H_
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
index 03318e5b7..37306c6a5 100644
--- a/third_party/aom/av1/encoder/dwt.h
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
#include "av1/common/common.h"
#include "av1/common/enums.h"
@@ -18,3 +21,5 @@ void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
int hbd);
int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 27ca53761..cb226c59e 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -40,11 +40,11 @@
#include "av1/common/reconinter.h"
#include "av1/common/seg_common.h"
#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
#include "av1/encoder/aq_complexity.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/aq_variance.h"
-#include "av1/common/warped_motion.h"
#include "av1/encoder/global_motion.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemb.h"
@@ -56,6 +56,7 @@
#include "av1/encoder/partition_model_weights.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
@@ -348,8 +349,9 @@ static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
x->skip = 0;
}
-static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
- ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row,
+static void update_state(const AV1_COMP *const cpi,
+ const TileDataEnc *const tile_data, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
int i, x_idx, y;
const AV1_COMMON *const cm = &cpi->common;
@@ -359,7 +361,7 @@ static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- MB_MODE_INFO *mi = &ctx->mic;
+ const MB_MODE_INFO *const mi = &ctx->mic;
MB_MODE_INFO *const mi_addr = xd->mi[0];
const struct segmentation *const seg = &cm->seg;
const int bw = mi_size_wide[mi->sb_type];
@@ -505,12 +507,12 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
}
-static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCK *const x, int mi_row, int mi_col,
RD_STATS *rd_cost, PARTITION_TYPE partition,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
- const AV1_COMMON *const cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -522,6 +524,13 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
int i, orig_rdmult;
+ if (best_rd < 0) {
+ ctx->rdcost = INT64_MAX;
+ ctx->skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
aom_clear_system_state();
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -588,9 +597,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
if (aq_mode == VARIANCE_AQ) {
if (cpi->vaq_refresh) {
- const int energy =
- bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
- mbmi->segment_id = av1_vaq_segment_id(energy);
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
}
x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
@@ -1407,8 +1417,8 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
RUN_TYPE dry_run, BLOCK_SIZE bsize,
- PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx,
- int *rate) {
+ PARTITION_TYPE partition,
+ const PICK_MODE_CONTEXT *const ctx, int *rate) {
TileInfo *const tile = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *xd = &x->e_mbd;
@@ -1691,7 +1701,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
}
if (do_partition_search &&
@@ -1728,7 +1738,20 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
pc_tree->partitioning = partition;
}
}
-
+ for (int b = 0; b < 2; ++b) {
+ pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 3; ++b) {
+ pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+ pc_tree->verticala[b].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 4; ++b) {
+ pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+ }
switch (partition) {
case PARTITION_NONE:
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
@@ -1741,7 +1764,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_row + hbs < cm->mi_rows) {
RD_STATS tmp_rdc;
- PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
av1_init_rd_stats(&tmp_rdc);
update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1765,7 +1788,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_col + hbs < cm->mi_cols) {
RD_STATS tmp_rdc;
- PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
+ const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
av1_init_rd_stats(&tmp_rdc);
update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1812,7 +1835,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
case PARTITION_HORZ_A:
case PARTITION_HORZ_B:
case PARTITION_HORZ_4:
- case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types");
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
default: assert(0); break;
}
@@ -2164,7 +2188,8 @@ static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
-static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+static INLINE void load_pred_mv(MACROBLOCK *x,
+ const PICK_MODE_CONTEXT *const ctx) {
memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
@@ -2221,12 +2246,11 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
// Try searching for an encoding for the given subblock. Returns zero if the
// rdcost is already too high (to tell the caller not to bother searching for
// encodings of further subblocks)
-static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
- TileDataEnc *tile_data, TOKENEXTRA **tp,
- int is_first, int is_last, int mi_row, int mi_col,
- BLOCK_SIZE subsize, RD_STATS *best_rdc,
- RD_STATS *sum_rdc, RD_STATS *this_rdc,
- PARTITION_TYPE partition,
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS *best_rdc, RD_STATS *sum_rdc,
+ RD_STATS *this_rdc, PARTITION_TYPE partition,
PICK_MODE_CONTEXT *prev_ctx,
PICK_MODE_CONTEXT *this_ctx) {
#define RTS_X_RATE_NOCOEF_ARG
@@ -2236,25 +2260,20 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
- // On the first time around, write the rd stats straight to sum_rdc. Also, we
- // should treat sum_rdc as containing zeros (even if it doesn't) to avoid
- // having to zero it at the start.
- if (is_first) this_rdc = sum_rdc;
- const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost;
- const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost;
+ const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc->rdcost - sum_rdc->rdcost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
rdcost_remaining);
- if (!is_first) {
- if (this_rdc->rate == INT_MAX) {
- sum_rdc->rdcost = INT64_MAX;
- } else {
- sum_rdc->rate += this_rdc->rate;
- sum_rdc->dist += this_rdc->dist;
- sum_rdc->rdcost += this_rdc->rdcost;
- }
+ if (this_rdc->rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc->rate;
+ sum_rdc->dist += this_rdc->dist;
+ sum_rdc->rdcost += this_rdc->rdcost;
}
if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
@@ -2271,7 +2290,7 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
#undef RTS_MAX_RDCOST
}
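
The reworked rd_try_subblock drops the is_first special case: the caller now seeds sum_rdc with the partition cost before the first call, and the remaining budget is computed with an explicit guard so that subtracting from an INT64_MAX "no best yet" sentinel cannot wrap. A minimal sketch of that guard (helper name is an assumption, not aom API):

    #include <stdint.h>

    /* Remaining rd budget with the INT64_MAX "no best yet" sentinel guarded,
     * so the subtraction cannot wrap around. */
    static int64_t remaining_rdcost(int64_t best_rdcost, int64_t spent_rdcost) {
      return best_rdcost == INT64_MAX ? INT64_MAX : best_rdcost - spent_rdcost;
    }
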
-static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
PC_TREE *pc_tree, RD_STATS *best_rdc,
PICK_MODE_CONTEXT ctxs[3],
@@ -2284,13 +2303,16 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
MACROBLOCKD *const xd = &x->e_mbd;
RD_STATS sum_rdc, this_rdc;
#define RTP_STX_TRY_ARGS
-
- if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0,
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
return;
- if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1,
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
return;
@@ -2302,15 +2324,13 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
// difference (obviously) doesn't contribute to the error.
const int try_block2 = 1;
if (try_block2 &&
- !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2,
+ !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
return;
if (sum_rdc.rdcost >= best_rdc->rdcost) return;
- int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rdc.rate += x->partition_cost[pl][partition];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost >= best_rdc->rdcost) return;
@@ -2321,45 +2341,6 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
#undef RTP_STX_TRY_ARGS
}
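
rd_test_partition3 now charges the partition signalling cost into sum_rdc up front, so every early-exit comparison against best_rdc already includes it, rather than adding the cost only after all three subblocks succeed. The shape of that pre-charge, with rd_cost() as a rough stand-in for the encoder's RDCOST() macro (the real macro in av1/encoder/rd.h uses different fixed-point scaling):

    #include <stdint.h>

    /* Stand-in for RDCOST(): rate weighted by the Lagrangian rdmult plus
     * distortion; scaling here is only illustrative. */
    static int64_t rd_cost(int rdmult, int rate, int64_t dist) {
      return ((int64_t)rate * rdmult >> 9) + (dist << 4);
    }

    /* Seed the running sum with the partition cost before any subblock is
     * searched, mirroring the sum_rdc initialization added above. */
    static void seed_sum_rdc(int rdmult, int partition_cost, int *sum_rate,
                             int64_t *sum_rdcost) {
      *sum_rate = partition_cost;
      *sum_rdcost = rd_cost(rdmult, partition_cost, /*dist=*/0);
    }
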
-#if CONFIG_DIST_8X8
-static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
- uint8_t *src_plane_8x8[MAX_MB_PLANE],
- uint8_t *dst_plane_8x8[MAX_MB_PLANE]) {
- const AV1_COMMON *const cm = &cpi->common;
- const int num_planes = av1_num_planes(cm);
- MACROBLOCKD *const xd = &x->e_mbd;
- int64_t dist_8x8, dist_8x8_uv, total_dist;
- const int src_stride = x->plane[0].src.stride;
- int plane;
-
- const int dst_stride = xd->plane[0].dst.stride;
- dist_8x8 =
- av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0],
- dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
- << 4;
-
- // Compute chroma distortion for a luma 8x8 block
- dist_8x8_uv = 0;
-
- if (num_planes > 1) {
- for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- unsigned sse;
- const int src_stride_uv = x->plane[plane].src.stride;
- const int dst_stride_uv = xd->plane[plane].dst.stride;
- const int ssx = xd->plane[plane].subsampling_x;
- const int ssy = xd->plane[plane].subsampling_y;
- const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy);
-
- cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv,
- dst_plane_8x8[plane], dst_stride_uv, &sse);
- dist_8x8_uv += (int64_t)sse << 4;
- }
- }
-
- return total_dist = dist_8x8 + dist_8x8_uv;
-}
-#endif // CONFIG_DIST_8X8
-
static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
pc_tree->partitioning = PARTITION_NONE;
pc_tree->cb_search_range = SEARCH_FULL_PLANE;
@@ -2372,7 +2353,7 @@ static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
}
}
-static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
RD_STATS *rd_cost, int64_t best_rd,
@@ -2410,7 +2391,12 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
(void)*tp_orig;
(void)split_rd;
- av1_zero(pc_tree->pc_tree_stats);
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
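
Note the new early return: best_rd arrives as a remaining budget (the parent's best minus cost already committed by siblings), so it can legitimately go negative, in which case this subtree cannot win and invalid stats are reported without searching. The same guard is added to rd_pick_partition further down. As a made-up predicate:

    #include <stdint.h>

    /* best_rd == parent_best - cost_committed_by_siblings; a negative value
     * means this subtree can no longer beat the current best. */
    static int budget_exhausted(int64_t best_rd) { return best_rd < 0; }
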
pc_tree->pc_tree_stats.valid = 1;
// Override partition costs at the edges of the frame in the same
@@ -2441,9 +2427,11 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
#ifndef NDEBUG
// Nothing should rely on the default value of this array (which is just
- // leftover from encoding the previous block. Setting it to magic number
+  // leftover from encoding the previous block). Set it to a fixed pattern
// when debugging.
- memset(x->blk_skip, 234, sizeof(x->blk_skip));
+  // Bits 0, 1 and 2 are the blk_skip flags of the three planes;
+  // bits 4, 5 and 6 are initialization-check bits for the three planes.
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
#endif // NDEBUG
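
The 0x77 poison value encodes the convention spelled out in the comment above: in debug builds, a plane's initialization-check bit presumably stays set until that plane's blk_skip flag is actually written, making reads of never-written entries detectable. A hedged sketch of a probe under that assumed convention (not aom API):

    #include <stdint.h>

    /* 0x77 is 01110111 in binary: bits 0-2 poison the per-plane blk_skip
     * flags, bits 4-6 mark each plane "not yet written". Assuming the check
     * bit is cleared when a plane's flag is written: */
    static int plane_blk_skip_written(uint8_t blk_skip_entry, int plane) {
      return ((blk_skip_entry >> (plane + 4)) & 1) == 0;
    }
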
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -2456,19 +2444,35 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
xd->left_txfm_context =
xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
// PARTITION_NONE
if (partition_none_allowed) {
- if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -2476,9 +2480,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
if (none_rd) *none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
if (bsize_at_least_8x8) {
- const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
- ? partition_cost[PARTITION_NONE]
- : 0;
this_rdc.rate += pt_cost;
this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
}
@@ -2520,17 +2521,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
int64_t temp_best_rdcost = best_rdc.rdcost;
pn_rdc = best_rdc;
-#if CONFIG_DIST_8X8
- uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
- if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
- for (int i = 0; i < MAX_MB_PLANE; i++) {
- src_plane_8x8[i] = x->plane[i].src.buf;
- dst_plane_8x8[i] = xd->plane[i].dst.buf;
- }
- }
-#endif // CONFIG_DIST_8X8
-
// PARTITION_SPLIT
if (do_square_split) {
int reached_last_index = 0;
@@ -2548,6 +2538,8 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
pc_tree->split[idx]->index = idx;
int64_t *p_split_rd = &split_rd[idx];
+      // TODO(Cherma): Account for the partition cost while passing best rd
+      // to rd_pick_sqr_partition().
rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
mi_col + x_idx, subsize, &this_rdc,
temp_best_rdcost - sum_rdc.rdcost,
@@ -2568,14 +2560,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
}
reached_last_index = (idx == 4);
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && reached_last_index &&
- sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
- sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
-
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
sum_rdc.rate += partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -2634,14 +2618,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
}
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
- best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
- encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
- pc_tree, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (bsize == cm->seq_params.sb_size) {
assert(best_rdc.rate < INT_MAX);
assert(best_rdc.dist < INT64_MAX);
@@ -2791,6 +2767,99 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
}
#undef FEATURE_SIZE
+static void ml_prune_rect_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_rd, int64_t none_rd,
+ int64_t *split_rd,
+ int *const dst_prune_horz,
+ int *const dst_prune_vert) {
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ best_rd = AOMMAX(best_rd, 1);
+ const NN_CONFIG *nn_config = NULL;
+ const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+ float cur_thresh = 0.0f;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_rect_partition_nnconfig_8;
+ cur_thresh = prob_thresholds[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_rect_partition_nnconfig_16;
+ cur_thresh = prob_thresholds[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_rect_partition_nnconfig_32;
+ cur_thresh = prob_thresholds[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_rect_partition_nnconfig_64;
+ cur_thresh = prob_thresholds[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_rect_partition_nnconfig_128;
+ cur_thresh = prob_thresholds[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+ aom_clear_system_state();
+
+ // 1. Compute input features
+ float features[9];
+
+ // RD cost ratios
+ for (int i = 0; i < 5; i++) features[i] = 1.0f;
+ if (none_rd > 0 && none_rd < 1000000000)
+ features[0] = (float)none_rd / (float)best_rd;
+ for (int i = 0; i < 4; i++) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ features[1 + i] = (float)split_rd[i] / (float)best_rd;
+ }
+
+ // Variance ratios
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int whole_block_variance;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ whole_block_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ whole_block_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+ int split_variance[4];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ struct buf_2d buf;
+ buf.stride = x->plane[0].src.stride;
+ const int bw = block_size_wide[bsize];
+ for (int i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bw / 2;
+ const int y_idx = (i >> 1) * bw / 2;
+ buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ split_variance[i] =
+ av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
+ } else {
+ split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
+ }
+ }
+
+ for (int i = 0; i < 4; i++)
+ features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+ // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+ float raw_scores[3] = { 0.0f };
+ av1_nn_predict(features, nn_config, raw_scores);
+ float probs[3] = { 0.0f };
+ av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd.
+ if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
+ if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+}
+
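
ml_prune_rect_partition feeds nine features (five rd-cost ratios for none/split, four sub-block variance ratios) through a small network and prunes PARTITION_HORZ or PARTITION_VERT when its softmax probability falls below the per-size threshold. For reference, a numerically stable three-way softmax of the kind av1_nn_softmax computes (a sketch, assuming the usual max-subtraction formulation):

    #include <math.h>

    /* Stable softmax over the three outputs (none-beats-both, horizontal,
     * vertical); probabilities below the per-bsize threshold prune the
     * corresponding rectangular partition. */
    static void softmax3(const float score[3], float prob[3]) {
      float max_score = score[0];
      for (int i = 1; i < 3; ++i)
        if (score[i] > max_score) max_score = score[i];
      float sum = 0.0f;
      for (int i = 0; i < 3; ++i) {
        prob[i] = expf(score[i] - max_score);
        sum += prob[i];
      }
      for (int i = 0; i < 3; ++i) prob[i] /= sum;
    }
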
// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
// considered.
static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
@@ -2880,13 +2949,14 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
#define FEATURES 18
#define LABELS 4
// Use a ML model to predict if horz4 and vert4 should be considered.
-static void ml_prune_4_partition(const AV1_COMP *const cpi,
- const MACROBLOCK *const x, BLOCK_SIZE bsize,
- int part_ctx, int64_t best_rd,
- int64_t horz_rd[2], int64_t vert_rd[2],
- int64_t split_rd[4],
+static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t horz_rd[2],
+ int64_t vert_rd[2], int64_t split_rd[4],
int *const partition_horz4_allowed,
- int *const partition_vert4_allowed) {
+ int *const partition_vert4_allowed,
+ unsigned int pb_source_variance, int mi_row,
+ int mi_col) {
if (best_rd >= 1000000000) return;
const NN_CONFIG *nn_config = NULL;
switch (bsize) {
@@ -2903,7 +2973,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
float features[FEATURES];
int feature_index = 0;
features[feature_index++] = (float)part_ctx;
- features[feature_index++] = (float)get_unsigned_bits(x->source_variance);
+ features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
int sub_block_rdcost[8] = { 0 };
@@ -2937,6 +3007,8 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
{
BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common));
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
const MACROBLOCKD *const xd = &x->e_mbd;
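
The added av1_setup_src_planes call re-anchors x->plane[].src at the current block before the sub-block features are measured, since earlier sub-searches may have moved the source pointers; pb_source_variance likewise replaces the possibly stale x->source_variance. Roughly what re-anchoring amounts to for one plane without subsampling (a sketch; MI_SIZE is 4 luma pixels per mode-info unit):

    #include <stdint.h>

    enum { MI_SIZE = 4 }; /* luma pixels per mode-info unit */

    /* Point a plane buffer at the top-left pixel of the block located at
     * (mi_row, mi_col); chroma subsampling omitted for simplicity. */
    static uint8_t *anchor_plane(uint8_t *frame_buf, int stride, int mi_row,
                                 int mi_col) {
      return frame_buf + (mi_row * MI_SIZE) * stride + mi_col * MI_SIZE;
    }
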
@@ -2990,7 +3062,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
}
}
- const float denom = (float)(x->source_variance + 1);
+ const float denom = (float)(pb_source_variance + 1);
const float low_b = 0.1f;
const float high_b = 10.0f;
for (int i = 0; i < 4; ++i) {
@@ -3022,9 +3094,9 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
// Make decisions based on the model scores.
int thresh = max_score;
switch (bsize) {
- case BLOCK_16X16: thresh -= 400; break;
- case BLOCK_32X32: thresh -= 400; break;
- case BLOCK_64X64: thresh -= 100; break;
+ case BLOCK_16X16: thresh -= 500; break;
+ case BLOCK_32X32: thresh -= 500; break;
+ case BLOCK_64X64: thresh -= 200; break;
default: break;
}
*partition_horz4_allowed = 0;
@@ -3039,10 +3111,73 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
#undef FEATURES
#undef LABELS
+#define FEATURES 4
+// ML-based partition search breakout.
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance) {
+ const NN_CONFIG *nn_config = NULL;
+ int thresh = 0;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_partition_breakout_nnconfig_8;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_partition_breakout_nnconfig_16;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_partition_breakout_nnconfig_32;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_partition_breakout_nnconfig_64;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_partition_breakout_nnconfig_128;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config || thresh < 0) return 0;
+
+ // Generate feature values.
+ float features[FEATURES];
+ int feature_index = 0;
+ aom_clear_system_state();
+
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+ rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+ features[feature_index++] = rate_f;
+
+ const float dist_f =
+ (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+ features[feature_index++] = dist_f;
+
+ features[feature_index++] = (float)pb_source_variance;
+
+ const int dc_q = (int)x->plane[0].dequant_QTX[0];
+ features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+ assert(feature_index == FEATURES);
+
+ // Calculate score using the NN model.
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, &score);
+
+ // Make decision.
+ return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+
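
The four breakout features are deliberately scale-normalized: rate is weighted by rdmult and divided by the pixel count, distortion is reduced to per-pixel SSE, and the DC quantizer enters squared with a 1/256 scale. A worked instance with made-up inputs for a 16x16 block:

    #include <stdint.h>

    /* Hypothetical inputs for a BLOCK_16X16, i.e. num_pels_log2 = 8 (256 px). */
    static void example_breakout_features(float features[4]) {
      const int rdmult = 128, rate = 1200, num_pels_log2 = 8;
      const int64_t dist = 51200;
      const int dc_q = 32;
      features[0] = ((float)rdmult / 128.0f / 512.0f /
                     (float)(1 << num_pels_log2)) *
                    (float)rate;                    /* ~0.00916 */
      features[1] = (float)(dist >> num_pels_log2); /* 200: per-pixel SSE */
      features[2] = 1024.0f;                        /* assumed variance */
      features[3] = (float)(dc_q * dc_q) / 256.0f;  /* 4.0 */
    }
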
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
RD_STATS *rd_cost, int64_t best_rd,
@@ -3068,6 +3203,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
int do_rectangular_split = 1;
+ int64_t cur_none_rd = 0;
int64_t split_rd[4] = { 0, 0, 0, 0 };
int64_t horz_rd[2] = { 0, 0 };
int64_t vert_rd[2] = { 0, 0 };
@@ -3077,6 +3213,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
int vert_ctx_is_ready = 0;
BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
// Override skipping rectangular partition operations for edge blocks
@@ -3129,9 +3271,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
#ifndef NDEBUG
// Nothing should rely on the default value of this array (which is just
- // leftover from encoding the previous block. Setting it to magic number
+  // leftover from encoding the previous block). Set it to a fixed pattern
// when debugging.
- memset(x->blk_skip, 234, sizeof(x->blk_skip));
+  // Bits 0, 1 and 2 are the blk_skip flags of the three planes;
+  // bits 4, 5 and 6 are initialization-check bits for the three planes.
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
#endif // NDEBUG
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -3143,7 +3287,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
const int cb_partition_search_ctrl =
@@ -3285,22 +3429,56 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
}
#endif
+  // Ref frames picked in the i-th quarter subblock during the square
+  // partition RD search; they may be used to prune the ref frame selection
+  // of rectangular partitions.
+ int ref_frames_used[4] = {
+ 0,
+ };
+
BEGIN_PARTITION_SEARCH:
if (x->must_find_valid_partition) {
partition_none_allowed = has_rows && has_cols;
partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
}
+
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+ if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
// PARTITION_NONE
if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX)
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+ pb_source_variance = x->source_variance;
if (none_rd) *none_rd = this_rdc.rdcost;
+ cur_none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+ for (int i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref_type);
+ }
+ }
if (bsize_at_least_8x8) {
- const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
- ? partition_cost[PARTITION_NONE]
- : 0;
this_rdc.rate += pt_cost;
this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
}
@@ -3318,16 +3496,29 @@ BEGIN_PARTITION_SEARCH:
best_rdc = this_rdc;
if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
- // If all y, u, v transform blocks in this partition are skippable, and
- // the dist & rate are within the thresholds, the partition search is
- // terminated for current branch of the partition search tree.
- // The dist & rate thresholds are set to 0 at speed 0 to disable the
- // early termination at that speed.
- if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
- (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
- best_rdc.rate < rate_breakout_thr)) {
- do_square_split = 0;
- do_rectangular_split = 0;
+ if ((do_square_split || do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && xd->bd == 8;
+ if (use_ml_based_breakout) {
+ if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
+ pb_source_variance)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+      // search is terminated for the current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
}
#if CONFIG_FP_MB_STATS
@@ -3384,24 +3575,14 @@ BEGIN_PARTITION_SEARCH:
// store estimated motion vector
if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
-#if CONFIG_DIST_8X8
- uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
- if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
- for (int i = 0; i < num_planes; i++) {
- src_plane_8x8[i] = x->plane[i].src.buf;
- dst_plane_8x8[i] = xd->plane[i].dst.buf;
- }
- }
-#endif // CONFIG_DIST_8X8
-
// PARTITION_SPLIT
if (do_square_split) {
av1_init_rd_stats(&sum_rdc);
- int reached_last_index = 0;
subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
- int idx;
+ sum_rdc.rate = partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int idx;
for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
const int x_idx = (idx & 1) * mi_step;
const int y_idx = (idx >> 1) * mi_step;
@@ -3413,8 +3594,13 @@ BEGIN_PARTITION_SEARCH:
pc_tree->split[idx]->index = idx;
int64_t *p_split_rd = &split_rd[idx];
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ if (cpi->sf.prune_ref_frame_for_rect_partitions)
+ pc_tree->split[idx]->none.rate = INT_MAX;
rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost,
+ subsize, &this_rdc, best_remain_rdcost,
pc_tree->split[idx], p_split_rd);
if (this_rdc.rate == INT_MAX) {
@@ -3424,11 +3610,16 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
-
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ pc_tree->split[idx]->none.rate != INT_MAX) {
+ const int ref_type =
+ av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
+ ref_frames_used[idx] |= (1 << ref_type);
+ }
if (idx <= 1 && (bsize <= BLOCK_8X8 ||
pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
- MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
@@ -3436,60 +3627,83 @@ BEGIN_PARTITION_SEARCH:
}
}
}
- reached_last_index = (idx == 4);
-
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && reached_last_index &&
- sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
+ const int reached_last_index = (idx == 4);
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_SPLIT;
}
- } else if (cpi->sf.less_rectangular_check) {
+ } else if (cpi->sf.less_rectangular_check_level > 0) {
// skip rectangular partition test when larger block size
// gives better rd cost
- do_rectangular_split &= !partition_none_allowed;
+ if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
+ do_rectangular_split &= !partition_none_allowed;
}
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
} // if (do_split)
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
+ }
+
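
The unions above rely on the raster indexing of the four split subblocks, so each rectangular half collects the ref-frame types seen in the two quarters it covers; an empty union leaves the mask at zero, i.e. no pruning. Schematically (helper is illustrative):

    /* Raster indexing of the four split quarters assumed by the unions:
     *   +---+---+
     *   | 0 | 1 |    horizontal[0] <- 0|1    vertical[0] <- 0|2
     *   +---+---+    horizontal[1] <- 2|3    vertical[1] <- 1|3
     *   | 2 | 3 |
     *   +---+---+
     * A set bit in skip_ref_frame_mask asks mode search to skip that
     * ref-frame type, so ~used_frames skips exactly the types the split
     * pass never picked. */
    static int skip_mask_from_used(int used_frames) {
      return used_frames ? ~used_frames : 0;
    }
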
+ int prune_horz = 0;
+ int prune_vert = 0;
+ if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+ (partition_horz_allowed || partition_vert_allowed)) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+ split_rd, &prune_horz, &prune_vert);
+ }
+
// PARTITION_HORZ
- if (partition_horz_allowed &&
+ if (partition_horz_allowed && !prune_horz &&
(do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_HORZ);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->horizontal[0].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ }
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
- best_rdc.rdcost);
- horz_rd[0] = sum_rdc.rdcost;
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ horz_rd[0] = this_rdc.rdcost;
if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
- PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
- MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
@@ -3501,24 +3715,15 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->horizontal[1].pred_interp_filter =
av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
-
+ }
rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
horz_rd[1] = this_rdc.rdcost;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
- update_state(cpi, tile_data, td, &pc_tree->horizontal[1],
- mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL);
- encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
- mi_row + mi_step, mi_col, subsize, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
@@ -3526,24 +3731,9 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
- bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_HORZ];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3555,7 +3745,7 @@ BEGIN_PARTITION_SEARCH:
}
// PARTITION_VERT
- if (partition_vert_allowed &&
+ if (partition_vert_allowed && !prune_vert &&
(do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_VERT);
@@ -3563,18 +3753,31 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->vertical[0].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ }
+ sum_rdc.rate = partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
PARTITION_VERT, subsize, &pc_tree->vertical[0],
- best_rdc.rdcost);
- vert_rd[0] = sum_rdc.rdcost;
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ vert_rd[0] = this_rdc.rdcost;
const int64_t vert_max_rdcost = best_rdc.rdcost;
if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
- MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
@@ -3587,24 +3790,15 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->vertical[1].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
+ }
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
PARTITION_VERT, subsize, &pc_tree->vertical[1],
best_rdc.rdcost - sum_rdc.rdcost);
vert_rd[1] = this_rdc.rdcost;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
- update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row,
- mi_col + mi_step, subsize, DRY_RUN_NORMAL);
- encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
- mi_col + mi_step, subsize, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
@@ -3612,25 +3806,9 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
- bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 &&
- 0 /* !CONFIG_CFL */)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_VERT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3641,6 +3819,17 @@ BEGIN_PARTITION_SEARCH:
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pb_source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ pb_source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ }
+
const int ext_partition_allowed =
do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
@@ -3649,15 +3838,26 @@ BEGIN_PARTITION_SEARCH:
int horzab_partition_allowed = ext_partition_allowed;
int vertab_partition_allowed = ext_partition_allowed;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+ horzab_partition_allowed = 0;
+ vertab_partition_allowed = 0;
+ }
+ }
+#endif
+
if (cpi->sf.prune_ext_partition_types_search_level) {
if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
(pc_tree->partitioning == PARTITION_NONE &&
- x->source_variance < 32) ||
+ pb_source_variance < 32) ||
pc_tree->partitioning == PARTITION_SPLIT);
vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
(pc_tree->partitioning == PARTITION_NONE &&
- x->source_variance < 32) ||
+ pb_source_variance < 32) ||
pc_tree->partitioning == PARTITION_SPLIT);
} else {
horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
@@ -3712,6 +3912,9 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
partition_horz_allowed && partition_vert_allowed) {
+ // TODO(huisu@google.com): x->source_variance may not be the current block's
+ // variance. The correct one to use is pb_source_variance.
+ // Need to re-train the model to fix it.
ml_prune_ab_partition(bsize, pc_tree->partitioning,
get_unsigned_bits(x->source_variance),
best_rdc.rdcost, horz_rd, vert_rd, split_rd,
@@ -3736,6 +3939,21 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontala[1].rd_mode_is_ready = 1;
}
}
+ pc_tree->horizontala[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames)
+ pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
@@ -3754,6 +3972,21 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
pc_tree->horizontalb[0].rd_mode_is_ready = 1;
}
+ pc_tree->horizontalb[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames)
+ pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_B, mi_row, mi_col, subsize,
@@ -3773,6 +4006,18 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
pc_tree->verticala[0].rd_mode_is_ready = 1;
}
+ pc_tree->verticala[0].skip_ref_frame_mask = 0;
+ pc_tree->verticala[1].skip_ref_frame_mask = 0;
+ pc_tree->verticala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_A, mi_row, mi_col, bsize2,
@@ -3791,6 +4036,18 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
pc_tree->verticalb[0].rd_mode_is_ready = 1;
}
+ pc_tree->verticalb[0].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[1].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
@@ -3823,9 +4080,19 @@ BEGIN_PARTITION_SEARCH:
partition_horz_allowed && partition_vert_allowed) {
ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
- &partition_vert4_allowed);
+ &partition_vert4_allowed, pb_source_variance, mi_row,
+ mi_col);
}
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+ partition_horz4_allowed = 0;
+ partition_vert4_allowed = 0;
+ }
+ }
+#endif
+
// PARTITION_HORZ_4
if (partition_horz4_allowed && has_rows &&
(do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
@@ -3834,25 +4101,33 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
for (int i = 0; i < 4; ++i) {
- int this_mi_row = mi_row + i * quarter_step;
+ const int this_mi_row = mi_row + i * quarter_step;
if (i > 0 && this_mi_row >= cm->mi_rows) break;
PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
ctx_this->rd_mode_is_ready = 0;
- if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3),
- this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc,
- &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this))
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[1])
+ : (ref_frames_used[2] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+ mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_HORZ_4, ctx_prev, ctx_this))
break;
ctx_prev = ctx_this;
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_HORZ_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3870,16 +4145,25 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+ sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
for (int i = 0; i < 4; ++i) {
- int this_mi_col = mi_col + i * quarter_step;
+ const int this_mi_col = mi_col + i * quarter_step;
if (i > 0 && this_mi_col >= cm->mi_cols) break;
PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
ctx_this->rd_mode_is_ready = 0;
- if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row,
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[2])
+ : (ref_frames_used[1] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
PARTITION_VERT_4, ctx_prev, ctx_this))
break;
@@ -3888,7 +4172,6 @@ BEGIN_PARTITION_SEARCH:
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_VERT_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3924,14 +4207,6 @@ BEGIN_PARTITION_SEARCH:
}
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
- best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
- encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
- pc_tree, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (bsize == cm->seq_params.sb_size) {
assert(best_rdc.rate < INT_MAX);
assert(best_rdc.dist < INT64_MAX);
@@ -3950,6 +4225,15 @@ static void init_first_partition_pass_stats_tables(
}
}
+// Recursively clear pc_tree_stats throughout the partition tree.
+static INLINE void clear_pc_tree_stats(PC_TREE *pt) {
+ if (pt == NULL) return;
+ pt->pc_tree_stats.valid = 0;
+ for (int i = 0; i < 4; ++i) {
+ clear_pc_tree_stats(pt->split[i]);
+ }
+}
+
// Minimum number of samples to trigger the
// mode_pruning_based_on_two_pass_partition_search feature.
#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
@@ -3963,7 +4247,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
SPEED_FEATURES *const sf = &cpi->sf;
- int mi_col;
const int leaf_nodes = 256;
// Initialize the left context for the new SB row
@@ -3977,26 +4260,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
+ PC_TREE *const pc_root =
+ td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
// Code each SB in the row
- for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += cm->seq_params.mib_size) {
- const struct segmentation *const seg = &cm->seg;
- int dummy_rate;
- int64_t dummy_dist;
- RD_STATS dummy_rdc;
- int i;
- int seg_skip = 0;
-
- const int idx_str = cm->mi_stride * mi_row + mi_col;
- MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
- PC_TREE *const pc_root =
- td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
-
av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
av1_fill_mode_rates(cm, x, xd->tile_ctx);
if (sf->adaptive_pred_interp_filter) {
- for (i = 0; i < leaf_nodes; ++i) {
+ for (int i = 0; i < leaf_nodes; ++i) {
td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4015,10 +4288,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
av1_zero(x->pred_mv);
pc_root->index = 0;
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
if (seg->enabled) {
const uint8_t *const map =
seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
- int segment_id =
+ const int segment_id =
map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
: 0;
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
@@ -4039,15 +4314,14 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
cpi, block_wavelet_energy_level);
} else {
const int block_var_level =
- av1_block_energy(cpi, x, cm->seq_params.sb_size);
+ av1_log_block_var(cpi, x, cm->seq_params.sb_size);
x->sb_energy_level = block_var_level;
offset_qindex =
av1_compute_deltaq_from_energy_level(cpi, block_var_level);
}
- int qmask = ~(cm->delta_q_res - 1);
+ const int qmask = ~(cm->delta_q_res - 1);
int current_qindex = clamp(cm->base_qindex + offset_qindex,
cm->delta_q_res, 256 - cm->delta_q_res);
-
current_qindex =
((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
cm->base_qindex;
@@ -4058,18 +4332,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
xd->mi[0]->current_qindex = current_qindex;
av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
- int j, k;
- int lfmask = ~(cm->delta_lf_res - 1);
- int delta_lf_from_base = offset_qindex / 2;
- delta_lf_from_base =
- ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+ const int lfmask = ~(cm->delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask);
// Pre-set the delta lf for the loop filter. Note that this value is set
// before mi is assigned for each block in the current superblock.
- for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row);
- j++) {
- for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col);
- k++) {
+ for (int j = 0;
+ j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) {
+ for (int k = 0;
+ k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) {
cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
.delta_lf_from_base =
clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
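
Both the qindex and the delta-lf paths snap an offset to the nearest multiple of the configured resolution using the mask idiom (v + res/2) & ~(res - 1), valid because the resolution is a power of two. A standalone sketch with a worked case in the comment:

    /* Round v to the nearest multiple of res (res must be a power of two),
     * as the qmask/lfmask expressions above do. E.g. res = 4: 5 -> 4,
     * 6 -> 8. */
    static int round_to_res(int v, int res) {
      return (v + res / 2) & ~(res - 1);
    }
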
@@ -4085,19 +4357,24 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
x->source_variance = UINT_MAX;
if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
- BLOCK_SIZE bsize;
set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
- bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
+ const BLOCK_SIZE bsize =
+ seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
pc_root);
} else if (cpi->partition_search_skippable_frame) {
- BLOCK_SIZE bsize;
set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
- bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ const BLOCK_SIZE bsize =
+ get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
@@ -4113,9 +4390,9 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
reset_partition(pc_root, cm->seq_params.sb_size);
x->use_cb_search_range = 0;
init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+      // Do the first pass if two-pass partition search is needed.
if (cpi->sf.two_pass_partition_search &&
- cpi->sf.use_square_partition_only_threshold <
- cm->seq_params.sb_size &&
+ cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
cm->frame_type != KEY_FRAME) {
@@ -4123,6 +4400,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
// Reset the stats tables.
if (sf->mode_pruning_based_on_two_pass_partition_search)
av1_zero(x->first_partition_pass_stats);
+ clear_pc_tree_stats(pc_root);
rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
pc_root, NULL);
@@ -4130,7 +4408,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->source_variance = UINT_MAX;
if (sf->adaptive_pred_interp_filter) {
- for (i = 0; i < leaf_nodes; ++i) {
+ for (int i = 0; i < leaf_nodes; ++i) {
td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4157,7 +4435,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->use_cb_search_range = 1;
if (sf->mode_pruning_based_on_two_pass_partition_search) {
- for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
FIRST_PARTITION_PASS_STATS *const stat =
&x->first_partition_pass_stats[i];
if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
@@ -4174,21 +4452,17 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
}
-
- rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
- cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
- pc_root, NULL);
- } else {
- rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
- cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
- pc_root, NULL);
}
+
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root,
+ NULL);
}
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
// TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
cm->tile_rows == 1) {
- av1_inter_mode_data_fit(x->rdmult);
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
}
#endif
}
@@ -4233,6 +4507,32 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
return cpi->common.tx_mode;
}
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+ }
+}
+
void av1_init_tile_data(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -4240,28 +4540,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
const int tile_rows = cm->tile_rows;
int tile_col, tile_row;
TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ TOKENLIST *tplist = cpi->tplist[0][0];
unsigned int tile_tok = 0;
-
- if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
- if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
- CHECK_MEM_ERROR(
- cm, cpi->tile_data,
- aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
- cpi->allocated_tiles = tile_cols * tile_rows;
-
- for (tile_row = 0; tile_row < tile_rows; ++tile_row)
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileDataEnc *const tile_data =
- &cpi->tile_data[tile_row * tile_cols + tile_col];
- int i, j;
- for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- tile_data->thresh_freq_fact[i][j] = 32;
- tile_data->mode_map[i][j] = j;
- }
- }
- }
- }
+ int tplist_count = 0;
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -4274,6 +4555,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
pre_tok = cpi->tile_tok[tile_row][tile_col];
tile_tok = allocated_tokens(
*tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = cpi->tplist[tile_row][tile_col];
+ tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
tile_data->allow_update_cdf = !cm->large_scale_tile;
tile_data->allow_update_cdf =
tile_data->allow_update_cdf && !cm->disable_cdf_update;
@@ -4281,15 +4565,56 @@ void av1_init_tile_data(AV1_COMP *cpi) {
}
}
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = NULL;
+ int sb_row_in_tile;
+ int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+
+ int num_mb_rows_in_sb =
+ ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+
+ sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+
+ get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
+ (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+
+ assert(
+ (unsigned int)(tok -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
+ get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+
+ (void)tile_mb_cols;
+ (void)num_mb_rows_in_sb;
+}
+
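The SB-row tokenization above records each row's token span in a TOKENLIST (declared in encoder.h later in this diff) as a [start, stop) pointer range. A minimal stand-alone sketch of that bookkeeping, with a stand-in token type:

    #include <assert.h>

    typedef struct { int dummy; } Tok;  /* stand-in for TOKENEXTRA */
    typedef struct { Tok *start, *stop; unsigned int count; } TokList;

    /* The tokenizer advances a write cursor while emitting; the number of
     * tokens a row consumed is just the pointer difference. */
    static void record_row(TokList *tl, Tok *start, Tok *stop) {
      assert(stop >= start);
      tl->start = start;
      tl->stop = stop;
      tl->count = (unsigned int)(stop - start);
    }
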
void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
int tile_col) {
AV1_COMMON *const cm = &cpi->common;
TileDataEnc *const this_tile =
&cpi->tile_data[tile_row * cm->tile_cols + tile_col];
const TileInfo *const tile_info = &this_tile->tile_info;
- TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
int mi_row;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ av1_inter_mode_data_init(this_tile);
+#endif
+
av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
tile_info->mi_col_end, tile_row);
av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
@@ -4310,25 +4635,23 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
mi_row += cm->seq_params.mib_size) {
- encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
}
-
- cpi->tok_count[tile_row][tile_col] =
- (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
- assert(cpi->tok_count[tile_row][tile_col] <=
- allocated_tokens(*tile_info,
- cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
- av1_num_planes(cm)));
}
static void encode_tiles(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
int tile_col, tile_row;
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
av1_init_tile_data(cpi);
- for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
}
@@ -4616,6 +4939,13 @@ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
return 0;
}
+static void set_default_interp_skip_flags(AV1_COMP *cpi) {
+ const int num_planes = av1_num_planes(&cpi->common);
+ cpi->default_interp_skip_flags = (num_planes == 1)
+ ? DEFAULT_LUMA_INTERP_SKIP_FLAG
+ : DEFAULT_INTERP_SKIP_FLAG;
+}
+
static void encode_frame_internal(AV1_COMP *cpi) {
ThreadData *const td = &cpi->td;
MACROBLOCK *const x = &td->mb;
@@ -4683,41 +5013,41 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_hash_table_create(&cm->cur_frame->hash_table);
av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 4);
av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 8);
av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 16);
av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 32);
av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 64);
av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 128);
@@ -4769,7 +5099,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_initialize_rd_consts(cpi);
av1_initialize_me_consts(cpi, x, cm->base_qindex);
init_encode_frame_mb_context(cpi);
-
+ set_default_interp_skip_flags(cpi);
if (cm->prev_frame)
cm->last_frame_seg_map = cm->prev_frame->seg_map;
else
@@ -4793,6 +5123,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_zero(rdc->global_motion_used);
av1_zero(cpi->gmparams_cost);
+#if !CONFIG_GLOBAL_MOTION_SEARCH
+ cpi->global_motion_search_done = 1;
+#endif // !CONFIG_GLOBAL_MOTION_SEARCH
if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
!cpi->global_motion_search_done) {
YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
@@ -4939,27 +5272,13 @@ static void encode_frame_internal(AV1_COMP *cpi) {
}
#endif
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- av1_inter_mode_data_init();
-#endif
-
- // If allowed, encoding tiles in parallel with one thread handling one tile.
- // TODO(geza.lore): The multi-threaded encoder is not safe with more than
- // 1 tile rows, as it uses the single above_context et al arrays from
- // cpi->common
- if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
+ if (cpi->row_mt && (cpi->oxcf.max_threads > 1))
+ av1_encode_tiles_mt(cpi);
+ else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
av1_encode_tiles_mt(cpi);
else
encode_tiles(cpi);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#if INTER_MODE_RD_TEST
- if (cpi->sf.inter_mode_rd_model_estimation) {
- av1_inter_mode_data_show(cm);
- }
-#endif
-#endif
-
aom_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
}
@@ -5407,7 +5726,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
}
mbmi->tx_size = tx_size;
- set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
(mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
}
CFL_CTX *const cfl = &xd->cfl;
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index 62141dba4..e8cf9b468 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEFRAME_H_
-#define AV1_ENCODER_ENCODEFRAME_H_
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
#include "aom/aom_integer.h"
#include "av1/common/blockd.h"
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define DELTAQ_MODULATION 0 // 0: variance based, 1: wavelet AC energy based
+#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based
struct macroblock;
struct yv12_buffer_config;
@@ -33,12 +33,15 @@ void av1_setup_src_planes(struct macroblock *x,
void av1_encode_frame(struct AV1_COMP *cpi);
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
void av1_init_tile_data(struct AV1_COMP *cpi);
void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEFRAME_H_
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index cea8db6f9..ad12577e6 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -222,11 +222,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
a = &args->ta[blk_col];
l = &args->tl[blk_row];
- // Assert not magic number (uninitialized).
- assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
- if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) &&
- !mbmi->skip_mode) {
+ if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
if (args->enable_optimize_b) {
@@ -350,6 +347,66 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
}
}
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Block and transform sizes, in log2 of the number of 4x4 blocks ("*_b"):
+ // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+ // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int i = 0, r, c;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ int blk_row, blk_col;
+
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+}
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+ }
+}
+
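A usage sketch for the visitor interface above; the count_tx_blocks helper is hypothetical and assumes a translation unit that sees BLOCK_SIZE, TX_SIZE and the declarations added to encodemb.h below:

    /* Visitor invoked once per transform block; `block` advances by
     * txw_unit * txh_unit between calls, matching the loop above. */
    static void count_tx_blocks(int plane, int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                void *arg) {
      (void)plane; (void)block; (void)blk_row; (void)blk_col;
      (void)plane_bsize; (void)tx_size;
      ++*(int *)arg;
    }

    /* Usage:
     *   int n = 0;
     *   av1_foreach_transformed_block_in_plane(xd, bsize, 0, count_tx_blocks, &n);
     *   n now holds the number of transform blocks visited in plane 0. */
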
typedef struct encode_block_pass1_args {
AV1_COMMON *cm;
MACROBLOCK *x;
@@ -382,7 +439,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
txfm_param.tx_set_type = av1_get_ext_tx_set_type(
txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
if (txfm_param.is_hbd) {
- av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
return;
}
av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
@@ -513,9 +570,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- // Assert not magic number (uninitialized).
- assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
- if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) {
+ if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
*eob = 0;
p->txb_entropy_ctx[block] = 0;
} else {
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
index 673f87ea7..39080de59 100644
--- a/third_party/aom/av1/encoder/encodemb.h
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEMB_H_
-#define AV1_ENCODER_ENCODEMB_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
#include "config/aom_config.h"
@@ -47,7 +47,18 @@ typedef enum AV1_XFORM_QUANT {
void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
int mi_row, int mi_col, RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes);
+
void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+
void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, TX_TYPE tx_type,
@@ -82,4 +93,4 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEMB_H_
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
index 944e2c53d..42eb5abf6 100644
--- a/third_party/aom/av1/encoder/encodemv.c
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -18,19 +18,37 @@
#include "av1/encoder/encodemv.h"
#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t log_in_base_2(unsigned int n) {
+ // get_msb() is only valid when n != 0.
+ return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
+ const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+ ? MV_CLASS_10
+ : (MV_CLASS_TYPE)log_in_base_2(z >> 3);
+ if (offset) *offset = z - mv_class_base(c);
+ return c;
+}
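
A worked example of the class computation above, as a self-contained sketch assuming CLASS0_SIZE is 2 (its libaom value), so class c > 0 starts at 2 << (c + 2):

    #include <assert.h>

    enum { SKETCH_CLASS0_SIZE = 2 };  /* assumed value of CLASS0_SIZE */

    static int sketch_base(int c) {
      return c ? SKETCH_CLASS0_SIZE << (c + 2) : 0;
    }

    /* Mirrors get_mv_class(): the class is floor(log2(z >> 3)), saturated to
     * class 10, and the offset is the distance from the class base. */
    static int sketch_mv_class(int z, int *offset) {
      int c = 0, m = z >> 3;
      if (z >= SKETCH_CLASS0_SIZE * 4096) c = 10; /* MV_CLASS_10 */
      else while (m >>= 1) ++c;                   /* floor(log2); 0 maps to 0 */
      if (offset) *offset = z - sketch_base(c);
      return c;
    }

    int main(void) {
      int offset;
      /* z = 100: 100 >> 3 = 12, floor(log2(12)) = 3, base(3) = 2 << 5 = 64 */
      assert(sketch_mv_class(100, &offset) == 3 && offset == 36);
      /* z < 8 always lands in class 0 with offset z */
      assert(sketch_mv_class(5, &offset) == 0 && offset == 5);
      return 0;
    }
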
static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
MvSubpelPrecision precision) {
+ assert(comp != 0);
int offset;
const int sign = comp < 0;
const int mag = sign ? -comp : comp;
- const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int mv_class = get_mv_class(mag - 1, &offset);
const int d = offset >> 3; // int mv data
const int fr = (offset >> 1) & 3; // fractional mv data
const int hp = offset & 1; // high precision mv data
- assert(comp != 0);
-
// Sign
aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
@@ -89,7 +107,7 @@ static void build_nmv_component_cost_table(int *mvcost,
for (v = 1; v <= MV_MAX; ++v) {
int z, c, o, d, e, f, cost = 0;
z = v - 1;
- c = av1_get_mv_class(z, &o);
+ c = get_mv_class(z, &o);
cost += class_cost[c];
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
index 64e9e7162..37ff547c8 100644
--- a/third_party/aom/av1/encoder/encodemv.h
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEMV_H_
-#define AV1_ENCODER_ENCODEMV_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
#include "av1/encoder/encoder.h"
@@ -40,8 +40,16 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp,
int_mv *nearest_mv, int_mv *near_mv,
int is_integer);
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+ }
+}
+
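A quick self-contained check of the joint classification above; row is the vertical component and col the horizontal one, and the stand-in enum mirrors the MV_JOINT_* order (zero, horizontal-nonzero/vertical-zero, and so on):

    #include <assert.h>

    typedef struct { short row, col; } SketchMV;  /* stand-in for MV */
    enum { J_ZERO, J_HNZVZ, J_HZVNZ, J_HNZVNZ };

    static int sketch_mv_joint(const SketchMV *mv) {
      if (mv->row == 0) return mv->col == 0 ? J_ZERO : J_HNZVZ;
      return mv->col == 0 ? J_HZVNZ : J_HNZVNZ;
    }

    int main(void) {
      const SketchMV zero = { 0, 0 }, horiz = { 0, 4 }, both = { -4, 4 };
      assert(sketch_mv_joint(&zero) == J_ZERO);
      assert(sketch_mv_joint(&horiz) == J_HNZVZ);  /* col moves, row does not */
      assert(sketch_mv_joint(&both) == J_HNZVNZ);
      return 0;
    }
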
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEMV_H_
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 13ea32e38..a2da2df89 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -14,9 +14,28 @@
#include <stdio.h>
#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "av1/common/alloccommon.h"
#include "av1/common/cdef.h"
@@ -38,6 +57,7 @@
#include "av1/encoder/encodetxb.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/firstpass.h"
+#include "av1/encoder/grain_test_vectors.h"
#include "av1/encoder/hash_motion.h"
#include "av1/encoder/mbgraph.h"
#include "av1/encoder/picklpf.h"
@@ -49,26 +69,6 @@
#include "av1/encoder/speed_features.h"
#include "av1/encoder/temporal_filter.h"
-#include "aom_dsp/psnr.h"
-#if CONFIG_INTERNAL_STATS
-#include "aom_dsp/ssim.h"
-#endif
-#include "av1/encoder/grain_test_vectors.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#if CONFIG_DENOISE
-#include "aom_dsp/grain_table.h"
-#include "aom_dsp/noise_util.h"
-#include "aom_dsp/noise_model.h"
-#endif
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-
#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
// av1 uses 10,000,000 ticks/second as time stamp
@@ -413,18 +413,13 @@ static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
}
void av1_initialize_enc(void) {
- static volatile int init_done = 0;
-
- if (!init_done) {
- av1_rtcd();
- aom_dsp_rtcd();
- aom_scale_rtcd();
- av1_init_intra_predictors();
- av1_init_me_luts();
- av1_rc_init_minq_luts();
- av1_init_wedge_masks();
- init_done = 1;
- }
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ av1_rc_init_minq_luts();
+ av1_init_wedge_masks();
}
static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
@@ -506,6 +501,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->td.mb.wsrc_buf);
cpi->td.mb.wsrc_buf = NULL;
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 2; j++) {
+ aom_free(cpi->td.mb.hash_value_buffer[i][j]);
+ cpi->td.mb.hash_value_buffer[i][j] = NULL;
+ }
aom_free(cpi->td.mb.mask_buf);
cpi->td.mb.mask_buf = NULL;
@@ -527,10 +527,18 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->tile_tok[0][0]);
cpi->tile_tok[0][0] = 0;
+ aom_free(cpi->tplist[0][0]);
+ cpi->tplist[0][0] = NULL;
+
av1_free_pc_tree(&cpi->td, num_planes);
aom_free(cpi->td.mb.palette_buffer);
+ aom_free(cpi->td.mb.tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
+ }
+
#if CONFIG_DENOISE
if (cpi->denoise_and_model) {
aom_denoise_and_model_free(cpi->denoise_and_model);
@@ -785,6 +793,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
av1_alloc_context_buffers(cm, cm->width, cm->height);
+ int mi_rows_aligned_to_sb =
+ ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
av1_alloc_txb_buf(cpi);
alloc_context_buffers_ext(cpi);
@@ -797,6 +809,11 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
+ aom_free(cpi->tplist[0][0]);
+
+ CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+ aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+ sizeof(*cpi->tplist[0][0])));
av1_setup_pc_tree(&cpi->common, &cpi->td);
}
@@ -1067,6 +1084,32 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
10; // Default value (not signaled)
}
+ if (cm->seq_params.monochrome) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
+ cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
+ cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.profile == 0) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.profile == 1) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.bit_depth == AOM_BITS_12) {
+ cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
+ cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+ } else {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 0;
+ }
+ }
+ }
+
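The chroma subsampling choice above follows the AV1 profile rules; a self-contained sketch with plain ints standing in for the seq_params fields:

    /* 4:0:0 for monochrome; 4:4:4 for the sRGB/identity case and profile 1;
     * 4:2:0 for profile 0; profile 2 is 4:2:2 unless 12-bit, where the
     * subsampling comes from the encoder config. */
    typedef struct { int ss_x, ss_y; } Subsampling;

    static Subsampling choose_subsampling(int monochrome, int is_srgb_identity,
                                          int profile, int bit_depth_12,
                                          int cfg_ss_x, int cfg_ss_y) {
      if (monochrome) return (Subsampling){ 1, 1 };        /* 4:0:0 */
      if (is_srgb_identity) return (Subsampling){ 0, 0 };  /* 4:4:4 */
      if (profile == 0) return (Subsampling){ 1, 1 };      /* 4:2:0 */
      if (profile == 1) return (Subsampling){ 0, 0 };      /* 4:4:4 */
      if (bit_depth_12) return (Subsampling){ cfg_ss_x, cfg_ss_y };
      return (Subsampling){ 1, 0 };                        /* 4:2:2 */
    }
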
cm->width = oxcf->width;
cm->height = oxcf->height;
set_sb_size(&cm->seq_params,
@@ -2326,6 +2369,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
cpi->common.options = oxcf->cfg;
+ cpi->row_mt = oxcf->row_mt;
x->e_mbd.bd = (int)seq_params->bit_depth;
x->e_mbd.global_motion = cm->global_motion;
@@ -2350,6 +2394,22 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
CHECK_MEM_ERROR(cm, x->palette_buffer,
aom_memalign(16, sizeof(*x->palette_buffer)));
}
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+ }
+ }
+
av1_reset_segment_features(cm);
set_high_precision_mv(cpi, 1, 0);
@@ -2367,11 +2427,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
rc->worst_quality = cpi->oxcf.worst_allowed_q;
rc->best_quality = cpi->oxcf.best_allowed_q;
- if (!oxcf->large_scale_tile)
- cm->interp_filter = cpi->sf.default_interp_filter;
- else
- cm->interp_filter = EIGHTTAP_REGULAR;
-
+ cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
cm->switchable_motion_mode = 1;
if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
@@ -2588,6 +2644,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+
+ cpi->td.mb.g_crc_initialized = 0;
+
CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
@@ -2913,9 +2978,19 @@ void av1_remove_compressor(AV1_COMP *cpi) {
// Deallocate allocated thread data.
if (t < cpi->num_workers - 1) {
aom_free(thread_data->td->palette_buffer);
+ aom_free(thread_data->td->tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(thread_data->td->tmp_obmc_bufs[j]);
+ }
aom_free(thread_data->td->above_pred_buf);
aom_free(thread_data->td->left_pred_buf);
aom_free(thread_data->td->wsrc_buf);
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(thread_data->td->hash_value_buffer[x][y]);
+ thread_data->td->hash_value_buffer[x][y] = NULL;
+ }
+ }
aom_free(thread_data->td->mask_buf);
aom_free(thread_data->td->counts);
av1_free_pc_tree(thread_data->td, num_planes);
@@ -3058,53 +3133,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
}
#endif
-#if USE_GF16_MULTI_LAYER
-static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- AV1_COMMON *const cm = &cpi->common;
- const FRAME_UPDATE_TYPE next_frame_update_type =
- gf_group->update_type[gf_group->index];
-
- if (cm->show_existing_frame == 1) {
- cm->show_existing_frame = 0;
- } else if (cpi->rc.is_last_bipred_frame) {
- cpi->rc.is_last_bipred_frame = 0;
- cm->show_existing_frame = 1;
- cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1];
- } else if (next_frame_update_type == OVERLAY_UPDATE ||
- next_frame_update_type == INTNL_OVERLAY_UPDATE) {
- // Check the temporal filtering status for the next OVERLAY frame
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- int which_arf = 0, arf_idx;
- // Identify the index to the next overlay frame.
- for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) {
- which_arf = arf_idx;
- break;
- }
- }
- assert(arf_idx < num_arfs_in_gf);
- if (cpi->is_arf_filter_off[which_arf]) {
- cm->show_existing_frame = 1;
- cpi->rc.is_src_frame_alt_ref = 1;
- cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
- ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
- : cpi->ref_fb_idx[BWDREF_FRAME - 1];
- cpi->is_arf_filter_off[which_arf] = 0;
- }
- }
- cpi->rc.is_src_frame_ext_arf = 0;
-}
-#endif // USE_GF16_MULTI_LAYER
-
static void check_show_existing_frame(AV1_COMP *cpi) {
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- check_show_existing_frame_gf16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
-
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
AV1_COMMON *const cm = &cpi->common;
const FRAME_UPDATE_TYPE next_frame_update_type =
@@ -3350,13 +3379,13 @@ static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
EXTREF_FRAME - 1 };
for (int i = 2; i > 0; --i) {
- cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
-
// [0] is allocated to the current coded frame, i.e. bwdref
memcpy(
cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
}
}
@@ -3370,52 +3399,16 @@ static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
EXTREF_FRAME - 1 };
for (int i = 0; i < 2; ++i) {
- cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
-
// [0] is allocated to the current coded frame, i.e. bwdref
memcpy(
cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
- }
-}
-#endif // USE_SYMM_MULTI_LAYER
-#if USE_GF16_MULTI_LAYER
-static void update_reference_frames_gf16(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- BufferPool *const pool = cm->buffer_pool;
-
- if (cm->frame_type == KEY_FRAME) {
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- ref_cnt_fb(pool->frame_bufs,
- &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
- cm->new_fb_idx);
- }
- } else {
- if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
- cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_alt_ref_frame) {
- assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
- ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx],
- cm->new_fb_idx);
- }
-
- // TODO(zoeliu): To handle cpi->interp_filter_selected[].
-
- // For GF of 16, an additional ref frame index mapping needs to be handled
- // if this is the last frame to encode in the current GF group.
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE)
- av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1);
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
}
-
-#if DUMP_REF_FRAME_IMAGES == 1
- // Dump out all reference frame images.
- dump_ref_frame_images(cpi);
-#endif // DUMP_REF_FRAME_IMAGES
}
-#endif // USE_GF16_MULTI_LAYER
+#endif // USE_SYMM_MULTI_LAYER
static void update_reference_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
@@ -3424,12 +3417,20 @@ static void update_reference_frames(AV1_COMP *cpi) {
// for the purpose to verify no mismatch between encoder and decoder.
if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- update_reference_frames_gf16(cpi);
- return;
+ // In the case of a show_existing frame, we will not send refresh flags
+ // to the decoder. Any change to the reference frame buffer can be done by
+ // switching the virtual indices.
+ if (cm->show_existing_frame) {
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
}
-#endif // USE_GF16_MULTI_LAYER
BufferPool *const pool = cm->buffer_pool;
@@ -3458,9 +3459,15 @@ static void update_reference_frames(AV1_COMP *cpi) {
// slot and, if we're updating the GF, the current frame becomes the new GF.
int tmp;
- ref_cnt_fb(pool->frame_bufs,
- &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
- cm->new_fb_idx);
+ // ARF in general is a better reference than overlay. We should keep ARF as
+ // a reference instead of replacing it with the overlay.
+
+ if (!cpi->preserve_arf_as_gld) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
+ cm->new_fb_idx);
+ }
+
tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1];
cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
@@ -3758,7 +3765,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
av1_set_speed_features_framesize_independent(cpi);
av1_set_rd_speed_thresholds(cpi);
av1_set_rd_speed_thresholds_sub8x8(cpi);
- cpi->common.interp_filter = cpi->sf.default_interp_filter;
+ cpi->common.interp_filter = SWITCHABLE;
cpi->common.switchable_motion_mode = 1;
}
@@ -3818,7 +3825,8 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy,
rst[2].restoration_unit_size = rst[1].restoration_unit_size;
}
-static void init_ref_frame_bufs(AV1_COMMON *cm) {
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
int i;
BufferPool *const pool = cm->buffer_pool;
cm->new_fb_idx = INVALID_IDX;
@@ -3828,7 +3836,7 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
}
if (cm->seq_params.force_screen_content_tools) {
for (i = 0; i < FRAME_BUFFERS; ++i) {
- av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+ av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
}
}
}
@@ -3846,7 +3854,7 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
seq_params->use_highbitdepth = use_highbitdepth;
alloc_raw_frame_buffers(cpi);
- init_ref_frame_bufs(cm);
+ init_ref_frame_bufs(cpi);
alloc_util_frame_buffers(cpi);
init_motion_estimation(cpi); // TODO(agrange) This can be removed.
@@ -4220,7 +4228,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
if (lf->filter_level[0] || lf->filter_level[1]) {
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0);
#else
if (cpi->num_workers > 1)
av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
@@ -4587,8 +4595,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
rc->projected_frame_size < rc->max_frame_bandwidth)
loop = 0;
- if (recode_loop_test_global_motion(cpi)) {
- loop = 1;
+ if (!cpi->sf.gm_disable_recode) {
+ if (recode_loop_test_global_motion(cpi)) loop = 1;
}
if (loop) {
@@ -4716,47 +4724,6 @@ static void set_ext_overrides(AV1_COMP *cpi) {
cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
}
-static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
- InterpFilter ifilter;
- int ref_total[REF_FRAMES] = { 0 };
- MV_REFERENCE_FRAME ref;
- int mask = 0;
- int arf_idx = ALTREF_FRAME;
-
- if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
- cpi->refresh_alt2_ref_frame)
- return mask;
-
- for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
- for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
- ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
-
- for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
- if ((ref_total[LAST_FRAME] &&
- cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
- (ref_total[LAST2_FRAME] == 0 ||
- cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 <
- ref_total[LAST2_FRAME]) &&
- (ref_total[LAST3_FRAME] == 0 ||
- cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 <
- ref_total[LAST3_FRAME]) &&
- (ref_total[GOLDEN_FRAME] == 0 ||
- cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
- ref_total[GOLDEN_FRAME]) &&
- (ref_total[BWDREF_FRAME] == 0 ||
- cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
- ref_total[BWDREF_FRAME]) &&
- (ref_total[ALTREF2_FRAME] == 0 ||
- cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 <
- ref_total[ALTREF2_FRAME]) &&
- (ref_total[ALTREF_FRAME] == 0 ||
- cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
- ref_total[ALTREF_FRAME]))
- mask |= 1 << ifilter;
- }
- return mask;
-}
-
#define DUMP_RECON_FRAMES 0
#if DUMP_RECON_FRAMES == 1
@@ -4914,7 +4881,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (cm->current_video_frame > 0)
cpi->ref_frame_flags = get_ref_frame_flags(cpi);
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
// NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
// BWDREF_FRAME in the reference frame buffer.
if (cm->frame_type == KEY_FRAME) {
@@ -4925,20 +4892,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->show_frame = 1;
cpi->frame_flags = *frame_flags;
- // In the case of show_existing frame, we will not send fresh flag
- // to decoder. Any change in the reference frame buffer can be done by
- // switching the virtual indices.
-
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_bwd_ref_frame = 0;
- cpi->rc.is_last_bipred_frame = 0;
- cpi->rc.is_bipred_frame = 0;
-
restore_coding_context(cpi);
// Build the bitstream
@@ -4990,10 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
av1_rc_postencode_update(cpi, *size);
}
- // Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
-
++cm->current_video_frame;
return AOM_CODEC_OK;
@@ -5002,9 +4951,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
- if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
- cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
-
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
// Reset the loop filter deltas and segmentation map.
@@ -5246,15 +5192,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
// takes a space in the gf group. Therefore, even when
// it is not shown, we still need to update the count down.
- // TODO(weitinglin): This is a work-around to handle the condition
- // when a frame is drop. We should fix the cm->show_frame flag
- // instead of checking the other condition to update the counter properly.
- if (cm->show_frame || is_frame_droppable(cpi)) {
- // Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
- }
-
if (cm->show_frame) {
// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that
// are
@@ -5279,6 +5216,50 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
return AOM_CODEC_OK;
}
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+ // differently here for rc->avg_frame_bandwidth.
+ if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+ // A show_existing_frame with a non-altref source that is not a keyframe
+ // is excluded above: its keyframe update counters were already
+ // incremented when the frame was originally encoded.
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ }
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+ // TODO(weitinglin): Updating this counter for is_frame_droppable
+ // is a work-around to handle the condition when a frame is dropped.
+ // We should fix the cpi->common.show_frame flag
+ // instead of checking this other condition to update the counter properly.
+ if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+ // Increment the gf group index ready for the next frame. If this is
+ // a show_existing_frame with a source other than altref, or if it is not
+ // a displayed forward keyframe, the index was incremented when it was
+ // originally encoded.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+ ++cpi->twopass.gf_group.index;
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
int skip_adapt, unsigned int *frame_flags) {
if (cpi->oxcf.rc_mode == AOM_CBR) {
@@ -5290,6 +5271,7 @@ static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
AOM_CODEC_OK) {
return AOM_CODEC_ERROR;
}
+ update_rc_counts(cpi);
check_show_existing_frame(cpi);
return AOM_CODEC_OK;
}
@@ -5319,14 +5301,8 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->cum_txcoeff_cost_timer);
#endif
- // Do not do post-encoding update for those frames that do not have a spot
- // in
- // a gf group, but note that an OVERLAY frame always has a spot in a gf
- // group,
- // even when show_existing_frame is used.
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
- av1_twopass_postencode_update(cpi);
- }
+ av1_twopass_postencode_update(cpi);
+ update_rc_counts(cpi);
check_show_existing_frame(cpi);
return AOM_CODEC_OK;
}
@@ -5734,7 +5710,7 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
av1_get_block_hash_value(
cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
block_size, &hash_value_1, &hash_value_2,
- (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH));
+ (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
// Hashing does not work for highbitdepth currently.
// TODO(Roger): Make it work for highbitdepth.
if (av1_use_hash_me(&cpi->common)) {
@@ -5822,13 +5798,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
- // Is multi-arf enabled.
- // Note that at the moment multi_arf is only configured for 2 pass VBR
- if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
- cpi->multi_arf_allowed = 1;
- else
- cpi->multi_arf_allowed = 0;
-
// Normal defaults
cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
? REFRESH_FRAME_CONTEXT_DISABLED
@@ -5850,16 +5819,20 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
struct lookahead_entry *lookahead_src = NULL;
if (cm->current_video_frame > 0)
lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
- if (lookahead_src != NULL &&
- ((cpi->oxcf.error_resilient_mode |
- ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
- (cpi->oxcf.s_frame_mode |
- ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) &&
- !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
- cm->show_existing_frame = 0;
+
+ int use_show_existing = 1;
+ if (lookahead_src != NULL) {
+ const int is_error_resilient =
+ cpi->oxcf.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.s_frame_mode ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
+ use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
}
- if (oxcf->pass == 2 && cm->show_existing_frame) {
+ if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
// Manage the source buffer and flush out the source frame that has been
// coded already; Also get prepared for PSNR calculation if needed.
if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
@@ -6415,3 +6388,50 @@ int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
+ if (!cpi) return NULL;
+
+ uint8_t header_buf[512] = { 0 };
+ const uint32_t sequence_header_size =
+ write_sequence_header_obu(cpi, &header_buf[0]);
+ assert(sequence_header_size <= sizeof(header_buf));
+ if (sequence_header_size == 0) return NULL;
+
+ const size_t obu_header_size = 1;
+ const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+ const size_t payload_offset = obu_header_size + size_field_size;
+
+ if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+ memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+ if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+ obu_header_size) {
+ return NULL;
+ }
+
+ size_t coded_size_field_size = 0;
+ if (aom_uleb_encode(sequence_header_size, size_field_size,
+ &header_buf[obu_header_size],
+ &coded_size_field_size) != 0) {
+ return NULL;
+ }
+ assert(coded_size_field_size == size_field_size);
+
+ aom_fixed_buf_t *global_headers =
+ (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+ if (!global_headers) return NULL;
+
+ const size_t global_header_buf_size =
+ obu_header_size + size_field_size + sequence_header_size;
+
+ global_headers->buf = malloc(global_header_buf_size);
+ if (!global_headers->buf) {
+ free(global_headers);
+ return NULL;
+ }
+
+ memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+ global_headers->sz = global_header_buf_size;
+ return global_headers;
+}
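A hedged usage sketch for the new entry point: the wrapper struct and its payload are separate malloc() allocations, and the header comment added to encoder.h below requires the caller to free both. The write_global_headers() caller here is hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    /* Write the Sequence Header OBU (low-overhead format, size field
     * included) to a stream and release both buffers. */
    static int write_global_headers(AV1_COMP *cpi, FILE *out) {
      aom_fixed_buf_t *hdr = av1_get_global_headers(cpi);
      if (!hdr) return -1;              /* NULL cpi or header write failure */
      fwrite(hdr->buf, 1, hdr->sz, out);
      free(hdr->buf);                   /* the OBU bytes... */
      free(hdr);                        /* ...and the aom_fixed_buf_t itself */
      return 0;
    }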
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 2b7ab711d..ee7fc4637 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODER_H_
-#define AV1_ENCODER_ENCODER_H_
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
#include <stdio.h>
@@ -142,7 +142,6 @@ typedef struct AV1EncoderConfig {
int noise_sensitivity; // pre processing blur: recommendation 0
int sharpness; // sharpening output: recommendation 0:
int speed;
- int dev_sf;
// maximum allowed bitrate for any intra frame in % of bitrate target.
unsigned int rc_max_intra_bitrate_pct;
// maximum allowed bitrate for any inter frame in % of bitrate target.
@@ -249,6 +248,7 @@ typedef struct AV1EncoderConfig {
int min_gf_interval;
int max_gf_interval;
+ int row_mt;
int tile_columns;
int tile_rows;
int tile_width_count;
@@ -309,6 +309,9 @@ typedef struct AV1EncoderConfig {
float noise_level;
int noise_block_size;
#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
} AV1EncoderConfig;
static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -401,6 +404,43 @@ typedef struct FRAME_COUNTS {
[SWITCHABLE_FILTERS];
} FRAME_COUNTS;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out
+// the maximum number of modes.
+#define MAX_INTER_MODES 1024
+typedef struct inter_modes_info {
+ int num;
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ int mode_rate_arr[MAX_INTER_MODES];
+ int64_t sse_arr[MAX_INTER_MODES];
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+} InterModesInfo;
+#endif
+
// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
typedef struct TileDataEnc {
TileInfo tile_info;
@@ -411,8 +451,18 @@ typedef struct TileDataEnc {
CFL_CTX cfl;
DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
uint8_t allow_update_cdf;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ InterModesInfo inter_modes_info;
+#endif
} TileDataEnc;
+typedef struct {
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+ unsigned int count;
+} TOKENLIST;
+
typedef struct RD_COUNTS {
int64_t comp_pred_diff[REFERENCE_MODES];
// Stores number of 4x4 blocks using global motion per reference frame.
@@ -427,11 +477,14 @@ typedef struct ThreadData {
FRAME_COUNTS *counts;
PC_TREE *pc_tree;
PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+ uint32_t *hash_value_buffer[2][2];
int32_t *wsrc_buf;
int32_t *mask_buf;
uint8_t *above_pred_buf;
uint8_t *left_pred_buf;
PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
int intrabc_used_this_tile;
} ThreadData;
@@ -502,6 +555,7 @@ typedef struct AV1_COMP {
int previous_index;
int cur_poc; // DebugInfo
+ unsigned int row_mt;
int scaled_ref_idx[REF_FRAMES];
int ref_fb_idx[REF_FRAMES];
int refresh_fb_idx; // ref frame buffer index to refresh
@@ -647,13 +701,12 @@ typedef struct AV1_COMP {
search_site_config ss_cfg;
- int multi_arf_allowed;
-
TileDataEnc *tile_data;
int allocated_tiles; // Keep track of memory allocated for tiles.
TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+ TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
@@ -703,8 +756,13 @@ typedef struct AV1_COMP {
#if CONFIG_DENOISE
struct aom_denoise_and_model_t *denoise_and_model;
#endif
+ // Stores the default value of the skip flag, which depends on the chroma
+ // format: 1 for monochrome and 3 for other color formats.
+ int default_interp_skip_flags;
+ int preserve_arf_as_gld;
} AV1_COMP;
+// Must not be called more than once.
void av1_initialize_enc(void);
struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
@@ -833,6 +891,22 @@ static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
}
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+ int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+ *tok = cpi->tile_tok[tile_row][tile_col] +
+ get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
#define ALT_MIN_LAG 3
@@ -885,8 +959,27 @@ static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
}
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame &&
+ (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME);
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the caller must free both the buf member of the aom_fixed_buf_t
+// and the aom_fixed_buf_t pointer itself, via calls to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODER_H_
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 81f360733..5a31d93d7 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -133,6 +133,38 @@ static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
return error;
}
+static const int8_t eob_to_pos_small[33] = {
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+ 6, // place holder
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+ int t;
+
+ if (eob < 33) {
+ t = eob_to_pos_small[eob];
+ } else {
+ const int e = AOMMIN((eob - 1) >> 5, 16);
+ t = eob_to_pos_large[e];
+ }
+
+ *extra = eob - k_eob_group_start[t];
+
+ return t;
+}
+
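The two eob-to-token tables above are an unrolled logarithm. A self-contained sketch of the equivalent closed form, assuming libaom's k_eob_group_start table is { 0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513 }:

    #include <assert.h>

    /* For eob >= 2 the token is 2 + floor(log2(eob - 1)), capped at 11 (the
     * large table saturates), and group t >= 4 starts at (1 << (t - 2)) + 1. */
    static int eob_token_closed_form(int eob, int *extra) {
      int t = eob;  /* eob 0 and 1 map to tokens 0 and 1 directly */
      if (eob >= 2) {
        int log2v = 0, m = eob - 1;
        while (m >>= 1) ++log2v;
        t = 2 + log2v;
        if (t > 11) t = 11;
      }
      *extra = eob - (t <= 3 ? t : (1 << (t - 2)) + 1);
      return t;
    }

    int main(void) {
      int extra;
      /* eob = 10 falls in the 9-16 group: token 5, extra = 10 - 9 = 1 */
      assert(eob_token_closed_form(10, &extra) == 5 && extra == 1);
      /* eob = 100: (100 - 1) >> 5 = 3 in the large table -> token 8,
       * extra = 100 - 65 = 35 */
      assert(eob_token_closed_form(100, &extra) == 8 && extra == 35);
      return 0;
    }
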
#if CONFIG_ENTROPY_STATS
void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
TX_CLASS tx_class, PLANE_TYPE plane,
@@ -464,8 +496,12 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
aom_writer *w, int blk_row, int blk_col, int plane,
TX_SIZE tx_size, const tran_low_t *tcoeff,
uint16_t eob, TXB_CTX *txb_ctx) {
- const PLANE_TYPE plane_type = get_plane_type(plane);
const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0,
+ ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+ if (eob == 0) return;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
const TX_CLASS tx_class = tx_type_to_class[tx_type];
@@ -475,18 +511,10 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
const int bwl = get_txb_bwl(tx_size);
const int width = get_txb_wide(tx_size);
const int height = get_txb_high(tx_size);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
uint8_t levels_buf[TX_PAD_2D];
uint8_t *const levels = set_levels(levels_buf, width);
DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-
- aom_write_symbol(w, eob == 0,
- ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
- if (plane == 0 && eob == 0) {
- assert(tx_type == DCT_DCT);
- }
- if (eob == 0) return;
-
av1_txb_init_levels(tcoeff, width, height, levels);
av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index 0442cc613..40ae343b0 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef ENCODETXB_H_
-#define ENCODETXB_H_
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
#include "config/aom_config.h"
@@ -84,4 +84,4 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
}
#endif
-#endif // COEFFS_CODING_H_
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 637d6824c..e8ac30bb5 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -27,7 +27,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
}
-static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
const AV1_COMMON *const cm = &cpi->common;
const int tile_cols = cm->tile_cols;
@@ -47,88 +48,141 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
return 1;
}
-void av1_encode_tiles_mt(AV1_COMP *cpi) {
+static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = cm->tile_cols;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
- int i;
- av1_init_tile_data(cpi);
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
- // Only run once to create threads and allocate thread data.
- if (cpi->num_workers == 0) {
- CHECK_MEM_ERROR(cm, cpi->workers,
- aom_malloc(num_workers * sizeof(*cpi->workers)));
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
- CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
- aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->above_pred_buf)));
+ CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->left_pred_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->palette_buffer,
+ aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->td->tmp_conv_dst)));
+ for (int j = 0; j < 2; ++j) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_obmc_bufs[j],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+ }
- ++cpi->num_workers;
- winterface->init(worker);
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
- thread_data->cpi = cpi;
+static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ // Encode a frame
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
- if (i < num_workers - 1) {
- // Allocate thread data.
- CHECK_MEM_ERROR(cm, thread_data->td,
- aom_memalign(32, sizeof(*thread_data->td)));
- av1_zero(*thread_data->td);
+ // Set the starting tile for each thread.
+ thread_data->start = i;
- // Set up pc_tree.
- thread_data->td->pc_tree = NULL;
- av1_setup_pc_tree(cm, thread_data->td);
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
- CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
- (uint8_t *)aom_memalign(
- 16, MAX_MB_PLANE * MAX_SB_SQUARE *
- sizeof(*thread_data->td->above_pred_buf)));
- CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
- (uint8_t *)aom_memalign(
- 16, MAX_MB_PLANE * MAX_SB_SQUARE *
- sizeof(*thread_data->td->left_pred_buf)));
+static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- CHECK_MEM_ERROR(
- cm, thread_data->td->wsrc_buf,
- (int32_t *)aom_memalign(
- 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
- CHECK_MEM_ERROR(
- cm, thread_data->td->mask_buf,
- (int32_t *)aom_memalign(
- 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
- // Allocate frame counters in thread data.
- CHECK_MEM_ERROR(cm, thread_data->td->counts,
- aom_calloc(1, sizeof(*thread_data->td->counts)));
-
- // Allocate buffers used by palette coding mode.
- CHECK_MEM_ERROR(
- cm, thread_data->td->palette_buffer,
- aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-
- // Create threads
- if (!winterface->reset(worker))
- aom_internal_error(&cm->error, AOM_CODEC_ERROR,
- "Tile encoder thread creation failed");
- } else {
- // Main thread acts as a worker and uses the thread data in cpi.
- thread_data->td = &cpi->td;
- }
+ // Encoding ends.
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+}
- winterface->sync(worker);
+static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
}
- } else {
- num_workers = AOMMIN(num_workers, cpi->num_workers);
}
+}
- for (i = 0; i < num_workers; i++) {
+static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
- worker->hook = (AVxWorkerHook)enc_worker_hook;
+ worker->hook = hook;
worker->data1 = thread_data;
worker->data2 = NULL;
@@ -139,47 +193,59 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
}
if (thread_data->td->counts != &cpi->counts) {
memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
}
- if (i < num_workers - 1)
+ if (i < num_workers - 1) {
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
- }
-
- // Encode a frame
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
- // Set the starting tile for each thread.
- thread_data->start = i;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_obmc_bufs[j] =
+ thread_data->td->tmp_obmc_bufs[j];
+ }
- if (i == cpi->num_workers - 1)
- winterface->execute(worker);
- else
- winterface->launch(worker);
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_obmc_bufs[j];
+ }
+ }
}
+}
- // Encoding ends.
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- winterface->sync(worker);
- }
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
- cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
- // Accumulate counters.
- if (i < cpi->num_workers - 1) {
- av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
- accumulate_rd_opt(&cpi->td, thread_data->td);
- cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
- }
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ create_enc_workers(cpi, num_workers);
+ } else {
+ num_workers = AOMMIN(num_workers, cpi->num_workers);
}
+ prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+ launch_enc_workers(cpi, num_workers);
+ sync_enc_workers(cpi, num_workers);
+ accumulate_counters_enc_workers(cpi, num_workers);
}
// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
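
The hunk above splits av1_encode_tiles_mt() into create/prepare/launch/sync/accumulate helpers. A minimal, self-contained sketch of the same lifecycle, using plain pthreads rather than the AVxWorker interface (all names below are hypothetical):

    /* Sketch: create workers once, hand each a hook argument, launch,
     * join, then merge per-worker results, mirroring the helpers above. */
    #include <pthread.h>
    #include <stdio.h>

    #define NUM_WORKERS 4

    typedef struct {
      int start;   /* starting tile index, as set in prepare_enc_workers() */
      int result;  /* per-worker output, merged after the sync phase */
    } WorkerData;

    static void *worker_hook(void *arg) {
      WorkerData *d = (WorkerData *)arg;
      d->result = d->start * 10; /* stand-in for encoding the assigned tiles */
      return NULL;
    }

    int main(void) {
      pthread_t workers[NUM_WORKERS];
      WorkerData data[NUM_WORKERS];
      /* prepare + launch; av1_encode_tiles_mt() runs the last worker on the
       * main thread via winterface->execute() instead of launching it */
      for (int i = 0; i < NUM_WORKERS; i++) {
        data[i].start = i;
        pthread_create(&workers[i], NULL, worker_hook, &data[i]);
      }
      /* sync, then accumulate counters */
      int total = 0;
      for (int i = 0; i < NUM_WORKERS; i++) {
        pthread_join(workers[i], NULL);
        total += data[i].result;
      }
      printf("total = %d\n", total); /* 0 + 10 + 20 + 30 = 60 */
      return 0;
    }
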
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
index b6b1fed4e..5de4b4803 100644
--- a/third_party/aom/av1/encoder/ethread.h
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ETHREAD_H_
-#define AV1_ENCODER_ETHREAD_H_
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
#ifdef __cplusplus
extern "C" {
@@ -34,4 +34,4 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
} // extern "C"
#endif
-#endif // AV1_ENCODER_ETHREAD_H_
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
index 48178b964..e0432cc97 100644
--- a/third_party/aom/av1/encoder/extend.h
+++ b/third_party/aom/av1/encoder/extend.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_EXTEND_H_
-#define AV1_ENCODER_EXTEND_H_
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
#include "aom_scale/yv12config.h"
#include "aom/aom_integer.h"
@@ -29,4 +29,4 @@ void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
} // extern "C"
#endif
-#endif // AV1_ENCODER_EXTEND_H_
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index ef0800c79..69dd20c52 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -31,6 +31,7 @@
#include "av1/encoder/aq_variance.h"
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
@@ -39,7 +40,7 @@
#include "av1/encoder/firstpass.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/rd.h"
-#include "av1/encoder/dwt.h"
+#include "av1/encoder/reconinter_enc.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@@ -51,9 +52,10 @@
#define FACTOR_PT_LOW 0.70
#define FACTOR_PT_HIGH 0.90
#define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
+#define GF_MAX_BOOST 90.0
#define INTRA_MODE_PENALTY 1024
-#define KF_MAX_BOOST 128.0
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
#define MIN_KF_BOOST 300
@@ -62,6 +64,7 @@
#define DEFAULT_GRP_WEIGHT 1.0
#define RC_FACTOR_MIN 0.75
#define RC_FACTOR_MAX 1.75
+#define MIN_FWD_KF_INTERVAL 8
#define NCOUNT_INTRA_THRESH 8192
#define NCOUNT_INTRA_FACTOR 3
@@ -1562,576 +1565,9 @@ static int calculate_boost_bits(int frame_count, int boost,
0);
}
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-#define GF_INTERVAL_16 16
-#define GF_FRAME_PARAMS (REF_FRAMES + 5)
-
-// GF Group of 16: multi-layer hierarchical coding structure
-// 1st Layer: Frame 0 and Frame 16 (ALTREF)
-// 2nd Layer: Frame 8 (ALTREF2)
-// 3rd Layer: Frame 4 and 12 (ALTREF2)
-// 4th Layer: Frame 2, 6, 10, and 14 (BWDREF)
-// 5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15
-static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = {
- // gf_group->index: coding order
- // (Frame #) : display order
- {
- // gf_group->index == 0 (Frame 0)
- OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // References (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- GOLDEN_FRAME // cpi->refresh_golden_frame = 1
- },
- {
- // gf_group->index == 1 (Frame 16)
- ARF_UPDATE, // update_type
- GF_INTERVAL_16 - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- GOLDEN_FRAME, // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- ALTREF_FRAME // cpi->refresh_alt_ref_frame = 1
- },
- {
- // gf_group->index == 2 (Frame 8)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 1) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 3 (Frame 4)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
- // (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
- // (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 4 (Frame 2)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
- // (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
- // (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- REF_FRAMES, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_ref_frame = 1
- },
- {
- // gf_group->index == 5 (Frame 1)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 6 (Frame 3)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 7 (Frame 4 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 8 (Frame 6)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx -> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 9 (Frame 5)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 10 (Frame 7)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 11 (Frame 8 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 12 (Frame 12)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 13 (Frame 10)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 14 (Frame 9)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 15 (Frame 11)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 16 (Frame 12 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 17 (Frame 14)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 18 (Frame 13)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 19 (Frame 15)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF
- // group)
- OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- GOLDEN_FRAME // cpi->refresh_golden_frame = 1
- }
-};
-
-// === GF Group of 16 ===
-static void define_gf_group_structure_16(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- const int key_frame = cpi->common.frame_type == KEY_FRAME;
-
- assert(rc->baseline_gf_interval == GF_INTERVAL_16);
-
- // Total number of frames to consider for GF group of 16:
- // = GF group interval + number of OVERLAY's
- // = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1
- // NOTE: The OVERLAY frame for the next GF group also needs to consider to
- // prepare for the reference frame index mapping.
-
- const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2;
-
- for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
- int param_idx = 0;
-
- // Treat KEY_FRAME differently
- if (frame_index == 0 && key_frame) {
- gf_group->update_type[frame_index] = KF_UPDATE;
-
- gf_group->rf_level[frame_index] = KF_STD;
- gf_group->arf_src_offset[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- gf_group->bidir_pred_enabled[frame_index] = 0;
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
- gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx;
- gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
- gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
-
- continue;
- }
-
- // == update_type ==
- gf_group->update_type[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == rf_level ==
- // Derive rf_level from update_type
- switch (gf_group->update_type[frame_index]) {
- case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
- case OVERLAY_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
- case LAST_BIPRED_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- case INTNL_ARF_UPDATE:
- gf_group->rf_level[frame_index] = GF_ARF_LOW;
- break;
- case INTNL_OVERLAY_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- default: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- }
-
- // == arf_src_offset ==
- gf_group->arf_src_offset[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == brf_src_offset ==
- gf_group->brf_src_offset[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == bidir_pred_enabled ==
- // Derive bidir_pred_enabled from bidir_src_offset
- gf_group->bidir_pred_enabled[frame_index] =
- gf_group->brf_src_offset[frame_index] ? 1 : 0;
-
- // == ref_fb_idx_map ==
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
- gf_group->ref_fb_idx_map[frame_index][ref_idx] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == refresh_idx ==
- gf_group->refresh_idx[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == refresh_flag ==
- gf_group->refresh_flag[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx];
- }
-
- // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE /
- // INTNL_OVERLAY_UPDATE for rate allocation
- // NOTE: Indexes are designed in the display order backward:
- // ALT[3] .. ALT[2] .. ALT[1] .. ALT[0],
- // but their coding order is as follows:
- // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0
-
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf;
-
- // == arf_pos_for_ovrly ==: Position for OVERLAY
- for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- const int prior_num_arfs =
- (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1);
- cpi->arf_pos_for_ovrly[arf_idx] =
- sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs;
- }
-
- // == arf_pos_in_gf ==: Position for ALTREF
- cpi->arf_pos_in_gf[0] = 1;
- cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1;
- cpi->arf_pos_in_gf[2] = 2;
- cpi->arf_pos_in_gf[3] = 3;
-
- // == arf_update_idx ==
- // == arf_ref_idx ==
- // NOTE: Due to the hierarchical nature of GF16, these two parameters only
- // relect the index to the nearest future overlay.
- int start_frame_index = 0;
- for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) {
- const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx];
- for (int frame_index = start_frame_index; frame_index <= end_frame_index;
- ++frame_index) {
- gf_group->arf_update_idx[frame_index] = arf_idx;
- gf_group->arf_ref_idx[frame_index] = arf_idx;
- }
- start_frame_index = end_frame_index + 1;
- }
-}
-#endif // USE_GF16_MULTI_LAYER
-
#if USE_SYMM_MULTI_LAYER
+// #define CHECK_GF_PARAMETER
+#ifdef CHECK_GF_PARAMETER
void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
int frame_nums) {
static const char *update_type_strings[] = {
@@ -2149,9 +1585,15 @@ void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
}
+
+ fprintf(fid, "number of nodes in each level: \n");
+ for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
+ fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+ }
+ fprintf(fid, "\n");
fclose(fid);
}
-
+#endif  // CHECK_GF_PARAMETER
static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
// Derive rf_level from update_type
switch (update_type) {
@@ -2169,14 +1611,17 @@ static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
int *frame_ind, int arf_ind, int level) {
- if (r - l == 2) {
- // leaf node, not a look-ahead frame
- gf_group->update_type[*frame_ind] = LF_UPDATE;
- gf_group->arf_src_offset[*frame_ind] = 0;
- gf_group->arf_pos_in_gf[*frame_ind] = 0;
- gf_group->arf_update_idx[*frame_ind] = arf_ind;
- gf_group->pyramid_level[*frame_ind] = level;
- ++(*frame_ind);
+ if (r - l < 4) {
+ while (++l < r) {
+ // leaf nodes, not a look-ahead frame
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = arf_ind;
+ gf_group->pyramid_level[*frame_ind] = 0;
+ ++gf_group->pyramid_lvl_nodes[0];
+ ++(*frame_ind);
+ }
} else {
int m = (l + r) / 2;
int arf_pos_in_gf = *frame_ind;
@@ -2186,6 +1631,7 @@ static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
gf_group->arf_pos_in_gf[*frame_ind] = 0;
gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1
gf_group->pyramid_level[*frame_ind] = level;
+ ++gf_group->pyramid_lvl_nodes[level];
++(*frame_ind);
// set parameters for frames displayed before this frame
@@ -2209,7 +1655,7 @@ static INLINE unsigned char get_pyramid_height(int pyramid_width) {
assert(pyramid_width <= 16 && pyramid_width >= 4 &&
"invalid gf interval for pyramid structure");
- return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2);
+ return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
}
static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
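
The remapped thresholds above spread pyramid heights across the whole 4..16 interval range, rather than requiring the exact widths 4, 8, and 16 that the old expression assumed. A standalone check of the new mapping (hypothetical harness around the expression shown above):

    #include <assert.h>
    #include <stdio.h>

    static unsigned char get_pyramid_height(int pyramid_width) {
      assert(pyramid_width <= 16 && pyramid_width >= 4);
      return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
    }

    int main(void) {
      /* widths 4..6 -> height 2, 7..12 -> height 3, 13..16 -> height 4 */
      for (int w = 4; w <= 16; ++w)
        printf("interval %2d -> pyramid height %d\n", w, get_pyramid_height(w));
      return 0;
    }
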
@@ -2217,6 +1663,10 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
int frame_index = 0;
gf_group->pyramid_height = get_pyramid_height(gf_interval);
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
+
+ av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+
// At the beginning of each GF group there will be a key or overlay frame.
gf_group->update_type[frame_index] = OVERLAY_UPDATE;
gf_group->arf_src_offset[frame_index] = 0;
@@ -2236,9 +1686,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
// set parameters for the rest of the frames
set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
gf_group->pyramid_height - 1);
-
- // check_frame_params(gf_group, gf_interval, frame_index);
-
return frame_index;
}
@@ -2248,8 +1695,8 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
GF_GROUP *const gf_group = &twopass->gf_group;
const int key_frame = cpi->common.frame_type == KEY_FRAME;
- assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 ||
- rc->baseline_gf_interval == 16);
+ assert(rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
const int gf_update_frames =
construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
@@ -2305,8 +1752,9 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
// This parameter is useless?
gf_group->arf_ref_idx[frame_index] = 0;
-
+#ifdef CHECK_GF_PARAMETER
check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+#endif
}
// It is an example of how to define a GF structure manually. The function will
@@ -2447,16 +1895,10 @@ static int define_gf_group_structure_4(AV1_COMP *cpi) {
static void define_gf_group_structure(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
-#if USE_GF16_MULTI_LAYER
- if (rc->baseline_gf_interval == 16) {
- define_gf_group_structure_16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
#if USE_SYMM_MULTI_LAYER
- const int valid_customized_gf_length = rc->baseline_gf_interval == 4 ||
- rc->baseline_gf_interval == 8 ||
- rc->baseline_gf_interval == 16;
+ const int valid_customized_gf_length =
+ rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
// use the new structure only if extra_arf is allowed
if (valid_customized_gf_length && rc->source_alt_ref_pending &&
cpi->extra_arf_allowed > 0) {
@@ -2685,6 +2127,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
gf_group->brf_src_offset[frame_index] = 0;
}
+#if USE_SYMM_MULTI_LAYER
+#define LEAF_REDUCTION_FACTOR 0.75f
+#define LVL_3_BOOST_FACTOR 0.8f
+#define LVL_2_BOOST_FACTOR 0.3f
+
+static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+ { 1, 0, 0 },
+ { LVL_3_BOOST_FACTOR, 0, 0 }, // Leaking budget works better
+ { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR,
+ (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) }
+};
+#endif // USE_SYMM_MULTI_LAYER
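
The table above leaks the bit budget that leaf frames give up (see the MULTI_LVL_BOOST_VBR_CQ code in the next hunk) to the internal ARFs, weighted per level. A hedged numeric sketch of that arithmetic; the node counts and target size are assumed for illustration, not taken from this patch:

    #include <stdio.h>

    int main(void) {
      const float leaf_reduction = 0.75f;          /* LEAF_REDUCTION_FACTOR */
      const float row[3] = { 0.8f, 0.06f, 0.14f }; /* lvl_budget_factor[2][*],
                                                      i.e. pyramid height 4 */
      const int nodes[4] = { 8, 4, 2, 1 }; /* assumed per-level node counts */
      const int target_frame_size = 1000;  /* assumed */

      /* each leaf gives up 75% of its share; the pool is split over levels */
      const float budget = leaf_reduction * nodes[0];
      for (int lvl = 3; lvl >= 1; --lvl) {
        const int dist2top = 3 - lvl; /* pyramid_height - 1 - this_lvl */
        const float boost = budget * row[dist2top] / nodes[lvl];
        printf("level %d ARF: +%d bits\n", lvl,
               (int)(target_frame_size * boost));
      }
      return 0; /* prints +4800, +180, +210 for levels 3, 2, 1 */
    }
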
static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
double group_error, int gf_arf_bits) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -2771,20 +2225,39 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
// BIPRED_UPDATE frames need to be further adjusted.
gf_group->bit_allocation[frame_index] = target_frame_size;
#if USE_SYMM_MULTI_LAYER
- } else if (cpi->new_bwdref_update_rule == 1 &&
+ } else if (cpi->new_bwdref_update_rule &&
gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ gf_group->pyramid_height >= 0 &&
+ "non-valid height for a pyramid structure");
+
int arf_pos = gf_group->arf_pos_in_gf[frame_index];
gf_group->bit_allocation[frame_index] = 0;
- // Tried boosting up the allocated bits on backward reference frame
- // by (target_frame_size >> 2) as in the original setting. However it
- // does not bring gains for pyramid structure with GF length = 16.
gf_group->bit_allocation[arf_pos] = target_frame_size;
-#endif
+#if MULTI_LVL_BOOST_VBR_CQ
+ const int pyr_h = gf_group->pyramid_height - 2;
+ const int this_lvl = gf_group->pyramid_level[arf_pos];
+ const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+
+ const float_t budget =
+ LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0];
+ const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] /
+ gf_group->pyramid_lvl_nodes[this_lvl];
+
+ gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost);
+#endif // MULTI_LVL_BOOST_VBR_CQ
+#endif // USE_SYMM_MULTI_LAYER
} else {
assert(gf_group->update_type[frame_index] == LF_UPDATE ||
gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
gf_group->bit_allocation[frame_index] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ if (cpi->new_bwdref_update_rule) {
+ gf_group->bit_allocation[frame_index] -=
+ (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+ }
+#endif // MULTI_LVL_BOOST_VBR_CQ
}
++frame_index;
@@ -2833,9 +2306,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i;
double boost_score = 0.0;
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
double old_boost_score = 0.0;
double mv_ratio_accumulator_thresh;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
#endif
double gf_group_err = 0.0;
#if GROUP_ADAPTIVE_MAXQ
@@ -2862,8 +2337,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int f_boost = 0;
int b_boost = 0;
int flash_detected;
- int active_max_gf_interval;
- int active_min_gf_interval;
int64_t gf_group_bits;
double gf_group_error_left;
int gf_arf_bits;
@@ -2898,11 +2371,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_group_skip_pct -= this_frame->intra_skip_pct;
gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
}
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0;
-#endif
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
{
@@ -2915,23 +2387,19 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
- if (cpi->multi_arf_allowed) {
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
active_max_gf_interval = rc->max_gf_interval;
- } else {
- // The value chosen depends on the active Q range. At low Q we have
- // bits to spare and are better with a smaller interval and smaller boost.
- // At high Q when there are few bits to spare we are better with a longer
- // interval to spread the cost of the GF.
- active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
- // We have: active_min_gf_interval <= rc->max_gf_interval
- if (active_max_gf_interval < active_min_gf_interval)
- active_max_gf_interval = active_min_gf_interval;
- else if (active_max_gf_interval > rc->max_gf_interval)
- active_max_gf_interval = rc->max_gf_interval;
- }
}
-
+#endif // !CONFIG_FIX_GF_LENGTH
double avg_sr_coded_error = 0;
double avg_raw_err_stdev = 0;
int non_zero_stdev_count = 0;
@@ -2990,10 +2458,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
boost_score +=
decay_accumulator *
calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
if (i == (FIXED_GF_LENGTH + 1)) break;
#else
- // Skip breaking condition for FIX_GF_INTERVAL_LENGTH
+ // Skip breaking condition for CONFIG_FIX_GF_LENGTH
// Break out conditions.
if (
// Break at active_max_gf_interval unless almost totally static.
@@ -3017,7 +2485,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
}
old_boost_score = boost_score;
-#endif // FIX_GF_INTERVAL_LENGTH
+#endif // CONFIG_FIX_GF_LENGTH
*this_frame = next_frame;
}
twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
@@ -3030,44 +2498,116 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
assert(num_mbs > 0);
if (i) avg_sr_coded_error /= i;
+ if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+ // Disable extra altrefs and backward refs for "still" gf group:
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int disable_bwd_extarf =
+ (zero_motion_accumulator > MIN_ZERO_MOTION &&
+ avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+ avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+
+ if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+ int alt_offset = 0;
+#if REDUCE_LAST_GF_LENGTH
+  // TODO(weitinglin): The length reduction strategy was tuned using AOM_Q
+  // mode and hurts the performance of VBR mode. We need to investigate how
+ // to adjust GF length for other modes.
+
+ int allow_gf_length_reduction =
+ cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0;
+
+  // We are going to have an alt ref, but we don't do the adjustment for
+  // lossless mode
+ if (allow_alt_ref && allow_gf_length_reduction &&
+ (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) &&
+ !is_lossless_requested(&cpi->oxcf)) {
+ // adjust length of this gf group if one of the following condition met
+    // adjust length of this gf group if one of the following conditions is met
+ // 2: next gf group is too short to have arf compared to the current gf
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter than
+    // this gf
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+      // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
+ // better in the current setting
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ alt_offset = -roll_back;
+ i -= roll_back;
+ }
+ }
+#endif
+
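
A hedged standalone check of the two roll-back triggers above (the scenario values are assumed): when the group would run right up to the next key frame, next_gf_len is 0 and single_overlay_left fires, shortening the group by REDUCE_GF_LENGTH_BY.

    #include <stdio.h>

    int main(void) {
      /* assumed scenario: gf group ends exactly at the next key frame */
      const int min_gf_interval = 4, frames_to_key = 16, i = 16;
      const int next_gf_len = frames_to_key - i; /* 0 */
      const int single_overlay_left =
          next_gf_len == 0 && i > 4 /* REDUCE_GF_LENGTH_THRESH */;
      const int unbalanced_gf =
          i > 9 /* REDUCE_GF_LENGTH_TO_KEY_THRESH */ && next_gf_len + 1 < 9 &&
          next_gf_len + 1 >= min_gf_interval;
      if (single_overlay_left || unbalanced_gf)
        printf("roll gf length back from %d to %d\n", i, i - 1);
      return 0;
    }
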
// Should we use the alternate reference frame.
if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
(i >= rc->min_gf_interval)) {
// Calculate the boost for alt ref.
rc->gfu_boost =
- calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
rc->source_alt_ref_pending = 1;
+
+    // do not replace ARFs with overlay frames, and keep them as GOLDEN_REF
+ cpi->preserve_arf_as_gld = 1;
} else {
rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
rc->source_alt_ref_pending = 0;
+ cpi->preserve_arf_as_gld = 0;
}
// Set the interval until the next gf.
- if (cpi->oxcf.fwd_kf_enabled) {
- // Ensure the gf group before the next keyframe will contain an altref
- if ((rc->frames_to_key - i < rc->min_gf_interval) &&
- (rc->frames_to_key != i)) {
- rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval,
- rc->static_scene_max_gf_interval);
- } else {
+ // If forward keyframes are enabled, ensure the final gf group obeys the
+ // MIN_FWD_KF_INTERVAL.
+ if (cpi->oxcf.fwd_kf_enabled &&
+ ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+ if (i == rc->frames_to_key) {
rc->baseline_gf_interval = i;
+ // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+ } else if ((rc->frames_to_key - i <
+ AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+ (rc->frames_to_key != i)) {
+ // if possible, merge the last two gf groups
+ if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
+ rc->baseline_gf_interval = rc->frames_to_key;
+ // if merging the last two gf groups creates a group that is too long,
+      // split them and force the last gf group to be MIN_FWD_KF_INTERVAL long
+ } else {
+ rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+ }
+ } else {
+ rc->baseline_gf_interval =
+ i - (is_key_frame || rc->source_alt_ref_pending);
}
} else {
rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
}
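
A worked example of the branch above, with assumed values: for frames_to_key = 20, i = 14, min_gf_interval = 4, and MIN_FWD_KF_INTERVAL = 8, only 6 frames remain, fewer than max(8, 4); merging would make a 20-frame group, longer than MAX_PYRAMID_SIZE (16 under the current pyramid limits, assumed here), so the groups are split and baseline_gf_interval becomes 20 - 8 = 12, leaving exactly MIN_FWD_KF_INTERVAL frames before the key frame. With frames_to_key = 14 instead, merging is allowed and baseline_gf_interval becomes 14.
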
- if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
- // Disable extra altrefs and backward refs for "still" gf group:
- // zero_motion_accumulator: minimum percentage of (0,0) motion;
- // avg_sr_coded_error: average of the SSE per pixel of each frame;
- // avg_raw_err_stdev: average of the standard deviation of (0,0)
- // motion error per block of each frame.
- const int disable_bwd_extarf =
- (zero_motion_accumulator > MIN_ZERO_MOTION &&
- avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
- avg_raw_err_stdev < MAX_RAW_ERR_VAR);
-
- if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+#if REDUCE_LAST_ALT_BOOST
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ rc->arf_boost_factor = 1.0;
+ if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - i == 0) {
+ rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+#endif
if (!cpi->extra_arf_allowed) {
cpi->num_extra_arfs = 0;
@@ -3439,6 +2979,11 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// how many bits to spend on it.
decay_accumulator = 1.0;
boost_score = 0.0;
+ const double kf_max_boost =
+ cpi->oxcf.rc_mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
for (i = 0; i < (rc->frames_to_key - 1); ++i) {
if (EOF == input_stats(twopass, &next_frame)) break;
@@ -3450,7 +2995,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if ((i <= rc->max_gf_interval) ||
((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
const double frame_boost =
- calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+ calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
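
In AOM_Q mode the cap introduced above scales with the keyframe interval: frames_to_key = 30 gives min(max(30 * 2.0, 80.0), 128.0) = 80.0, frames_to_key = 50 gives 100.0, and frames_to_key = 100 saturates at KF_MAX_FRAME_BOOST = 128.0. The other rate-control modes keep the flat 128.0 cap, matching the old KF_MAX_BOOST behavior.
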
@@ -3513,147 +3058,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->modified_error_left -= kf_group_err;
}
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
-
- int ref_fb_idx_prev[REF_FRAMES];
- int ref_fb_idx_curr[REF_FRAMES];
-
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame];
- }
-
- // Update map index for each reference frame
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
- int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx];
- ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME];
- }
-
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame];
- }
-}
-
-// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates_16(AV1_COMP *cpi) {
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
-
- if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
- cpi->refresh_fb_idx = 0;
-
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_alt_ref_frame = 1;
-
- return;
- }
-
- // Update reference frame map indexes
- av1_ref_frame_map_idx_updates(cpi, gf_group->index);
-
- // Update refresh index
- switch (gf_group->refresh_idx[gf_group->index]) {
- case LAST_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME];
- break;
-
- case LAST2_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME];
- break;
-
- case LAST3_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME];
- break;
-
- case GOLDEN_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
- break;
-
- case BWDREF_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1];
- break;
-
- case ALTREF2_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
- break;
-
- case ALTREF_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
- break;
-
- case REF_FRAMES:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1];
- break;
-
- default: assert(0); break;
- }
-
- // Update refresh flags
- switch (gf_group->refresh_flag[gf_group->index]) {
- case LAST_FRAME:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case GOLDEN_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case BWDREF_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case ALTREF2_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case ALTREF_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 1;
- break;
-
- default: assert(0); break;
- }
-
- switch (gf_group->update_type[gf_group->index]) {
- case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break;
-
- case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break;
-
- case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break;
-
- case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1;
- case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break;
-
- default: break;
- }
-}
-#endif // USE_GF16_MULTI_LAYER
-
// Define the reference buffers that will be updated post encode.
static void configure_buffer_updates(AV1_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
@@ -3667,14 +3071,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
cpi->rc.is_bipred_frame = 0;
cpi->rc.is_src_frame_ext_arf = 0;
-#if USE_GF16_MULTI_LAYER
- RATE_CONTROL *const rc = &cpi->rc;
- if (rc->baseline_gf_interval == 16) {
- configure_buffer_updates_16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
-
switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
case KF_UPDATE:
cpi->refresh_last_frame = 1;
@@ -3979,8 +3375,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
: cpi->common.MBs;
// The multiplication by 256 reverses a scaling factor of (>> 8)
// applied when combining MB error values for the frame.
- twopass->mb_av_energy =
- log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+ twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
twopass->frame_avg_haar_energy =
log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
}
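
With the x256 compensation dropped, mb_av_energy is computed on the raw per-MB scale: for energies well above 1, log(256 * x + 1) is approximately log(x + 1) + ln(256), so the change lowers the metric by about 5.545 wherever it is consumed downstream.
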
@@ -4020,9 +3415,6 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
}
twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
- // Increment the gf group index ready for the next frame.
- ++twopass->gf_group.index;
-
// If the rate control is drifting consider adjustment to min or maxq.
if ((cpi->oxcf.rc_mode != AOM_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index b0c1a21e4..4b7325ae2 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_FIRSTPASS_H_
-#define AV1_ENCODER_FIRSTPASS_H_
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
#include "av1/common/enums.h"
#include "av1/common/onyxc_int.h"
@@ -47,15 +47,7 @@ typedef struct {
// number of bi-predictive frames.
#define BFG_INTERVAL 2
// The maximum number of extra ALTREFs, excluding ALTREF_FRAME
-// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered
-// to serve as references. Currently REF_FRAMES == 8.
-#define USE_GF16_MULTI_LAYER 0
-
-#if USE_GF16_MULTI_LAYER
-#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME)
-#else // !USE_GF16_MULTI_LAYER
#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
-#endif // USE_GF16_MULTI_LAYER
#define MIN_EXT_ARF_INTERVAL 4
@@ -126,6 +118,7 @@ typedef struct {
unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char pyramid_height;
+ unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
#endif
unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
@@ -197,10 +190,6 @@ void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
// Post encode update of the rate control parameters for 2-pass
void av1_twopass_postencode_update(struct AV1_COMP *cpi);
-#if USE_GF16_MULTI_LAYER
-void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index);
-#endif // USE_GF16_MULTI_LAYER
-
static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
if (arf_pending && MAX_EXT_ARFS > 0)
return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
@@ -216,4 +205,4 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_FIRSTPASS_H_
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
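
A quick check on the simplified bound above, as a hedged worked example (it assumes the usual AV1 reference-frame enum, where BWDREF_FRAME == 5 and REF_FRAMES == 8; neither value is shown in this hunk):

    // MAX_EXT_ARFS = REF_FRAMES - BWDREF_FRAME - 1 = 8 - 5 - 1 = 2
    // With MIN_EXT_ARF_INTERVAL == 4, and judging by the condition visible in
    // get_number_of_extra_arfs(), both extra ARFs can be granted only for a
    // golden-frame interval of at least
    // MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) == 12 frames.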
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
index f07d1bc00..e9f8b0bb4 100644
--- a/third_party/aom/av1/encoder/global_motion.c
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -32,8 +32,8 @@
// Border over which to compute the global motion
#define ERRORADV_BORDER 0
-static const double erroradv_tr[] = { 0.75, 0.70, 0.65 };
-static const double erroradv_prod_tr[] = { 22000, 20000, 18000 };
+static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
int erroradv_type) {
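
Both tables feed the error-advantage acceptance test, so lowering every entry makes the encoder stricter about keeping a global-motion model. A hedged sketch of how they are consumed (the comparison below is an assumption based on the surrounding function signature, not code shown in this patch):

    // Assumed shape of the check inside is_enough_erroradvantage():
    //   best_erroradvantage < erroradv_tr[erroradv_type] &&
    //   best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type]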
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
index 2c15753fd..c7c016c43 100644
--- a/third_party/aom/av1/encoder/global_motion.h
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
-#define AV1_ENCODER_GLOBAL_MOTION_H_
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
#include "aom/aom_integer.h"
#include "aom_scale/yv12config.h"
@@ -61,4 +61,4 @@ int compute_global_motion_feature_based(TransformationType type,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_GLOBAL_MOTION_H_
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
index 45632da9b..945dc3733 100644
--- a/third_party/aom/av1/encoder/grain_test_vectors.h
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_GRAIN_TEST_VECTORS_H_
-#define AV1_GRAIN_TEST_VECTORS_H_
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
/* Test vectors for emulation of different film grain types.
* Note that bit depth would be derived from the bitstream and
@@ -778,4 +778,4 @@ static aom_film_grain_t film_grain_test_vectors[16] = {
45231 /* random_seed */
},
};
-#endif // AV1_GRAIN_TEST_VECTORS_H_
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
index 8b6227540..826c004d6 100644
--- a/third_party/aom/av1/encoder/hash.h
+++ b/third_party/aom/av1/encoder/hash.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HASH_H_
-#define AV1_ENCODER_HASH_H_
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
#include "config/aom_config.h"
@@ -43,8 +43,10 @@ typedef struct _CRC32C {
// init table for software version crc32c
void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_HASH_H_
+#endif // AOM_AV1_ENCODER_HASH_H_
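
The 4096 in the new constant matches the geometry documented where the macro previously lived in hash_motion.c (see the removed comment block in the next file): one slot per 2x2 block of a 128x128 superblock.

    // (128 / 2) * (128 / 2) == 64 * 64 == 4096 hash entries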
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index f2ff5b495..e85a516e8 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -13,14 +13,12 @@
#include "config/av1_rtcd.h"
+#include "av1/encoder/block.h"
#include "av1/encoder/hash.h"
#include "av1/encoder/hash_motion.h"
static const int crc_bits = 16;
static const int block_size_bits = 3;
-static CRC_CALCULATOR crc_calculator1;
-static CRC_CALCULATOR crc_calculator2;
-static int g_crc_initialized = 0;
static void hash_table_clear_all(hash_table *p_hash_table) {
if (p_hash_table->p_lookup_table == NULL) {
@@ -106,11 +104,11 @@ static int hash_block_size_to_index(int block_size) {
}
}
-void av1_hash_table_init(hash_table *p_hash_table) {
- if (g_crc_initialized == 0) {
- av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
- av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
- g_crc_initialized = 1;
+void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
+ if (x->g_crc_initialized == 0) {
+ av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
+ av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
+ x->g_crc_initialized = 1;
}
p_hash_table->p_lookup_table = NULL;
}
@@ -181,7 +179,8 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *pic_block_hash[2],
- int8_t *pic_block_same_info[3]) {
+ int8_t *pic_block_same_info[3],
+ MACROBLOCK *x) {
const int width = 2;
const int height = 2;
const int x_end = picture->y_crop_width - width + 1;
@@ -201,9 +200,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
pic_block_hash[0][pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
+ &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
pic_block_hash[1][pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+ &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
pos++;
}
pos += width - 1;
@@ -220,9 +219,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
pic_block_hash[0][pos] =
- av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
+ av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0]));
pic_block_hash[1][pos] =
- av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
+ av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0]));
pos++;
}
pos += width - 1;
@@ -235,7 +234,8 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *src_pic_block_hash[2],
uint32_t *dst_pic_block_hash[2],
int8_t *src_pic_block_same_info[3],
- int8_t *dst_pic_block_same_info[3]) {
+ int8_t *dst_pic_block_same_info[3],
+ MACROBLOCK *x) {
const int pic_width = picture->y_crop_width;
const int x_end = picture->y_crop_width - block_size + 1;
const int y_end = picture->y_crop_height - block_size + 1;
@@ -254,14 +254,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
dst_pic_block_hash[0][pos] =
- av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length);
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length);
p[0] = src_pic_block_hash[1][pos];
p[1] = src_pic_block_hash[1][pos + src_size];
p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
dst_pic_block_hash[1][pos] =
- av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length);
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length);
dst_pic_block_same_info[0][pos] =
src_pic_block_same_info[0][pos] &&
@@ -388,17 +388,9 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
return 1;
}
-// global buffer for hash value calculation of a block
-// used only in av1_get_block_hash_value()
-#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
-// [first hash/second hash]
-// [two buffers used ping-pong]
-// [num of 2x2 blocks in 128x128]
-static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH];
-
void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
uint32_t *hash_value1, uint32_t *hash_value2,
- int use_highbitdepth) {
+ int use_highbitdepth, MACROBLOCK *x) {
uint32_t to_hash[4];
const int add_value = hash_block_size_to_index(block_size) << crc_bits;
assert(add_value >= 0);
@@ -415,10 +407,12 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
get_pixels_in_1D_short_array_by_block_2x2(
y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- hash_value_buffer[0][0][pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
- hash_value_buffer[1][0][pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[0][0][pos] =
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] =
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
}
}
} else {
@@ -429,10 +423,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
stride, pixel_to_hash);
assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- hash_value_buffer[0][0][pos] = av1_get_crc_value(
- &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
- hash_value_buffer[1][0][pos] = av1_get_crc_value(
- &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
+ &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
+ &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
}
}
}
@@ -457,24 +451,24 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
assert(srcPos + src_sub_block_in_width + 1 <
AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- to_hash[0] = hash_value_buffer[0][src_idx][srcPos];
- to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1];
+ to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
to_hash[2] =
- hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
- to_hash[3] =
- hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1];
+ x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[0][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
- hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+ x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
- to_hash[0] = hash_value_buffer[1][src_idx][srcPos];
- to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1];
+ to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
to_hash[2] =
- hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
- to_hash[3] =
- hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1];
- hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+ x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[1][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
+ x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
dst_pos++;
}
}
@@ -483,8 +477,6 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
sub_block_in_width >>= 1;
}
- *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
- *hash_value2 = hash_value_buffer[1][dst_idx][0];
+ *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
}
-
-#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH
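
The net effect of the hash_motion.c changes is that the CRC calculators and the ping-pong hash buffer move from file-scope statics into MACROBLOCK, so each block context carries its own hashing state. A minimal usage sketch under that reading (the MACROBLOCK accessor is invented for illustration; the two patched calls match the signatures above):

    MACROBLOCK *x = get_thread_block_context();  // hypothetical accessor
    hash_table table;
    av1_hash_table_init(&table, x);  // seeds x->crc_calculator1/2 once per x
    uint32_t hash1, hash2;
    av1_get_block_hash_value(src_buf, src_stride, /*block_size=*/64, &hash1,
                             &hash2, /*use_highbitdepth=*/0, x);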
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
index 8deb92eb6..df3ec3215 100644
--- a/third_party/aom/av1/encoder/hash_motion.h
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HASH_MOTION_H_
-#define AV1_ENCODER_HASH_MOTION_H_
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
#include "config/aom_config.h"
@@ -34,7 +34,7 @@ typedef struct _hash_table {
Vector **p_lookup_table;
} hash_table;
-void av1_hash_table_init(hash_table *p_hash_table);
+void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
void av1_hash_table_destroy(hash_table *p_hash_table);
void av1_hash_table_create(hash_table *p_hash_table);
int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
@@ -44,13 +44,15 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
uint32_t hash_value2);
void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *pic_block_hash[2],
- int8_t *pic_block_same_info[3]);
+ int8_t *pic_block_same_info[3],
+ struct macroblock *x);
void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
int block_size,
uint32_t *src_pic_block_hash[2],
uint32_t *dst_pic_block_hash[2],
int8_t *src_pic_block_same_info[3],
- int8_t *dst_pic_block_same_info[3]);
+ int8_t *dst_pic_block_same_info[3],
+ struct macroblock *x);
void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
uint32_t *pic_hash[2],
int8_t *pic_is_same,
@@ -67,10 +69,10 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
int block_size, int x_start, int y_start);
void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
uint32_t *hash_value1, uint32_t *hash_value2,
- int use_highbitdepth);
+ int use_highbitdepth, struct macroblock *x);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_HASH_MOTION_H_
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
index 0922557d0..67898fd18 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -121,15 +121,45 @@ static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
int32_t *dst_coeff = (int32_t *)coeff;
- av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
- txfm_param->bd);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+      // Use the C version for types that include an identity stage, for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
}
static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
int32_t *dst_coeff = (int32_t *)coeff;
- av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
- txfm_param->bd);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+      // Use the C version for types that include an identity stage, for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
}
static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
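
Both hunks in this file apply the same routing rule: transform types that contain an identity stage stay on the C reference (av1_fwd_txfm2d_*_c), while everything else goes through the dispatched av1_fwd_txfm2d_* entry point. The predicate, pulled out as a standalone sketch (not code from the patch):

    static int has_identity_stage(TX_TYPE tx_type) {
      return tx_type == V_DCT || tx_type == H_DCT || tx_type == V_ADST ||
             tx_type == H_ADST || tx_type == V_FLIPADST ||
             tx_type == H_FLIPADST || tx_type == IDTX;
    }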
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
index 6155b255a..daabc7119 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
-#define AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
#include "config/aom_config.h"
@@ -28,4 +28,4 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
} // extern "C"
#endif
-#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
index 3897c2a6a..e55224cf7 100644
--- a/third_party/aom/av1/encoder/lookahead.h
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_LOOKAHEAD_H_
-#define AV1_ENCODER_LOOKAHEAD_H_
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
#include "aom_scale/yv12config.h"
#include "aom/aom_integer.h"
@@ -103,4 +103,4 @@ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
} // extern "C"
#endif
-#endif // AV1_ENCODER_LOOKAHEAD_H_
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h
index 23243dd9e..64f936176 100644
--- a/third_party/aom/av1/encoder/mathutils.h
+++ b/third_party/aom/av1/encoder/mathutils.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
#include <memory.h>
#include <math.h>
#include <stdio.h>
@@ -23,7 +26,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
double c;
// Forward elimination
for (k = 0; k < n - 1; k++) {
- // Bring the largest magitude to the diagonal position
+ // Bring the largest magnitude to the diagonal position
for (i = n - 1; i > k; i--) {
if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
for (j = 0; j < n; j++) {
@@ -352,3 +355,5 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
return 0;
}
+
+#endif // AOM_AV1_ENCODER_MATHUTILS_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
index 472173634..1a35ff77c 100644
--- a/third_party/aom/av1/encoder/mbgraph.c
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -17,11 +17,12 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/system_state.h"
-#include "av1/encoder/segmentation.h"
-#include "av1/encoder/mcomp.h"
#include "av1/common/blockd.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
int mb_row, int mb_col) {
@@ -140,7 +141,7 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
// calculate SATD for each intra prediction mode;
// we're intentionally not doing 4x4, we just want a rough estimate
- for (mode = DC_PRED; mode <= PAETH_PRED; mode++) {
+ for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
unsigned int err;
xd->mi[0]->mode = mode;
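
A note on the loop-bound change above, hedged because the enum itself is not shown here:

    // Assumed layout in av1/common/enums.h:
    //   INTRA_MODE_START == DC_PRED, INTRA_MODE_END == PAETH_PRED + 1
    // so "mode = INTRA_MODE_START; mode < INTRA_MODE_END" covers the same
    // range as the old "mode = DC_PRED; mode <= PAETH_PRED", while
    // automatically tracking any future additions to the enum.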
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
index 3e0a4fa9b..ba08476f7 100644
--- a/third_party/aom/av1/encoder/mbgraph.h
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_MBGRAPH_H_
-#define AV1_ENCODER_MBGRAPH_H_
+#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
+#define AOM_AV1_ENCODER_MBGRAPH_H_
#ifdef __cplusplus
extern "C" {
@@ -38,4 +38,4 @@ void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_MBGRAPH_H_
+#endif // AOM_AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index c4572a341..8f6de9b53 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -29,6 +29,7 @@
#include "av1/encoder/encodemv.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
// #define NEW_DIAMOND_SEARCH
@@ -219,7 +220,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
thismse = upsampled_pref_error( \
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \
pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
- mask_stride, invert_mask, w, h, &sse); \
+ mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
v += thismse; \
if (v < besterr) { \
@@ -342,19 +343,19 @@ static unsigned int setup_center_error(
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
if (mask) {
- aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset,
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
y_stride, mask, mask_stride, invert_mask);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
- aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h,
- y + offset, y_stride, &xd->jcp_param);
+ aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
else
- aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
y_stride);
}
- besterr =
- vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
if (mask) {
@@ -648,51 +649,54 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
int subpel_x_q3, int subpel_y_q3,
const uint8_t *second_pred, const uint8_t *mask,
int mask_stride, int invert_mask, int w, int h,
- unsigned int *sse) {
+ unsigned int *sse, int subpel_search) {
unsigned int besterr;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
if (second_pred != NULL) {
if (mask) {
aom_highbd_comp_mask_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd);
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+ subpel_search);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
aom_highbd_jnt_comp_avg_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param);
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search);
else
- aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16,
- second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, xd->bd);
+ aom_highbd_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, subpel_search);
}
} else {
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
}
-
- besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+ besterr = vfp->vf(pred8, w, src, src_stride, sse);
} else {
DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
if (second_pred != NULL) {
if (mask) {
- aom_comp_mask_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask);
+ aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+ second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask,
+ mask_stride, invert_mask, subpel_search);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
aom_jnt_comp_avg_upsampled_pred(
xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, &xd->jcp_param);
+ subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
else
aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
}
} else {
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
}
besterr = vfp->vf(pred, w, src, src_stride, sse);
@@ -707,10 +711,11 @@ static unsigned int upsampled_setup_center_error(
const int src_stride, const uint8_t *const y, int y_stride,
const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
- unsigned int *sse1, int *distortion) {
- unsigned int besterr = upsampled_pref_error(
- xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset,
- y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1);
+ unsigned int *sse1, int *distortion, int subpel_search) {
+ unsigned int besterr =
+ upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
+ y + offset, y_stride, 0, 0, second_pred, mask,
+ mask_stride, invert_mask, w, h, sse1, subpel_search);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
@@ -781,7 +786,8 @@ int av1_find_best_sub_pixel_tree(
besterr = upsampled_setup_center_error(
xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
- h, offset, mvjcost, mvcost, sse1, distortion);
+ h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
else
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
src_address, src_stride, y, y_stride,
@@ -802,7 +808,8 @@ int av1_find_best_sub_pixel_tree(
thismse = upsampled_pref_error(
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
- mask, mask_stride, invert_mask, w, h, &sse);
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = estimate_upsampled_pref_error(
xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -837,7 +844,8 @@ int av1_find_best_sub_pixel_tree(
thismse = upsampled_pref_error(
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
- mask, mask_stride, invert_mask, w, h, &sse);
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = estimate_upsampled_pref_error(
xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -929,8 +937,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
int16_t bc = mbmi->mv[0].as_mv.col;
int16_t *tr = &mbmi->mv[0].as_mv.row;
int16_t *tc = &mbmi->mv[0].as_mv.col;
- WarpedMotionParams best_wm_params = mbmi->wm_params[0];
- int best_num_proj_ref = mbmi->num_proj_ref[0];
+ WarpedMotionParams best_wm_params = mbmi->wm_params;
+ int best_num_proj_ref = mbmi->num_proj_ref;
unsigned int bestmse;
int minc, maxc, minr, maxr;
const int start = cm->allow_high_precision_mv ? 0 : 4;
@@ -962,18 +970,18 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
if (total_samples > 1)
- mbmi->num_proj_ref[0] =
+ mbmi->num_proj_ref =
selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
- if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr,
- *tc, &mbmi->wm_params[0], mi_row, mi_col)) {
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
+ *tc, &mbmi->wm_params, mi_row, mi_col)) {
thismse =
av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
if (thismse < bestmse) {
best_idx = idx;
- best_wm_params = mbmi->wm_params[0];
- best_num_proj_ref = mbmi->num_proj_ref[0];
+ best_wm_params = mbmi->wm_params;
+ best_num_proj_ref = mbmi->num_proj_ref;
bestmse = thismse;
}
}
@@ -990,8 +998,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
*tr = br;
*tc = bc;
- mbmi->wm_params[0] = best_wm_params;
- mbmi->num_proj_ref[0] = best_num_proj_ref;
+ mbmi->wm_params = best_wm_params;
+ mbmi->num_proj_ref = best_num_proj_ref;
return bestmse;
}
@@ -2013,8 +2021,16 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
const uint8_t *mask, int mask_stride,
int invert_mask, const MV *center_mv,
const uint8_t *second_pred) {
- const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
- { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+ static const search_neighbors neighbors[8] = {
+ { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+ };
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
@@ -2022,6 +2038,10 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
MV *best_mv = &x->best_mv.as_mv;
unsigned int best_sad = INT_MAX;
int i, j;
+ uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] =
+ { 0 };
+ int grid_center = SEARCH_GRID_CENTER_8P;
+ int grid_coord = grid_center;
clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
x->mv_limits.row_min, x->mv_limits.row_max);
@@ -2043,13 +2063,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
}
+ do_refine_search_grid[grid_coord] = 1;
+
for (i = 0; i < search_range; ++i) {
int best_site = -1;
for (j = 0; j < 8; ++j) {
- const MV mv = { best_mv->row + neighbors[j].row,
- best_mv->col + neighbors[j].col };
+ grid_coord = grid_center + neighbors[j].coord_offset;
+ if (do_refine_search_grid[grid_coord] == 1) {
+ continue;
+ }
+ const MV mv = { best_mv->row + neighbors[j].coord.row,
+ best_mv->col + neighbors[j].coord.col };
+ do_refine_search_grid[grid_coord] = 1;
if (is_mv_in(&x->mv_limits, &mv)) {
unsigned int sad;
if (mask) {
@@ -2079,8 +2106,9 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
if (best_site == -1) {
break;
} else {
- best_mv->row += neighbors[best_site].row;
- best_mv->col += neighbors[best_site].col;
+ best_mv->row += neighbors[best_site].coord.row;
+ best_mv->col += neighbors[best_site].coord.col;
+ grid_center += neighbors[best_site].coord_offset;
}
}
return best_sad;
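
The coord_offset field added to each neighbor caches row * SEARCH_GRID_STRIDE_8P + col, so marking and testing visited cells needs no multiplication inside the loop. Worked through with the constants defined in mcomp.h further down (SEARCH_RANGE_8P == 3):

    // stride = 2 * 3 + 1 = 7; center = 3 * 7 + 3 = 24
    // a step of { row = -1, col = +1 } has offset -1 * 7 + 1 = -6,
    // so its cell is do_refine_search_grid[24 - 6], i.e. grid index 18.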
@@ -2099,11 +2127,11 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
}
int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- MV *mvp_full, int step_param, int error_per_bit,
+ MV *mvp_full, int step_param, int method,
+ int run_mesh_search, int error_per_bit,
int *cost_list, const MV *ref_mv, int var_max, int rd,
int x_pos, int y_pos, int intra) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const SEARCH_METHODS method = sf->mv.search_method;
const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
@@ -2168,11 +2196,35 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
default: assert(0 && "Invalid search method.");
}
+  // Should we allow a follow-on exhaustive search?
+ if (!run_mesh_search) {
+ if (method == NSTEP) {
+ if (is_exhaustive_allowed(cpi, x)) {
+        int exhaustive_thr = sf->exhaustive_searches_thresh;
+        exhaustive_thr >>=
+            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+        // Threshold variance for an exhaustive full search.
+        if (var > exhaustive_thr) run_mesh_search = 1;
+ }
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+
if (method != NSTEP && rd && var < var_max)
var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
do {
- if (!av1_use_hash_me(&cpi->common)) break;
+ if (!intra || !av1_use_hash_me(&cpi->common)) break;
// already single ME
// get block size and original buffer of current block
@@ -2195,7 +2247,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
av1_get_block_hash_value(
what, what_stride, block_width, &hash_value1, &hash_value2,
- x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
    // for intra, at least one match can be found: the block itself.
@@ -2279,7 +2331,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
MV this_mv = { r, c }; \
thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \
mask, vfp, z, pre(y, y_stride, r, c), \
- y_stride, sp(c), sp(r), w, h, &sse); \
+ y_stride, sp(c), sp(r), w, h, &sse, \
+ use_accurate_subpel_search); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
@@ -2307,18 +2360,20 @@ static int upsampled_obmc_pref_error(
MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
const int32_t *const wsrc, const uint8_t *const y, int y_stride,
- int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) {
+ int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+ int subpel_search) {
unsigned int besterr;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
- besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+ DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
+ besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
} else {
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
besterr = vfp->ovf(pred, w, wsrc, mask, sse);
}
@@ -2330,10 +2385,11 @@ static unsigned int upsampled_setup_obmc_center_error(
const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
const uint8_t *const y, int y_stride, int w, int h, int offset,
- int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
- unsigned int besterr =
- upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc,
- y + offset, y_stride, 0, 0, w, h, sse1);
+ int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+ int subpel_search) {
+ unsigned int besterr = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+ 0, w, h, sse1, subpel_search);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
@@ -2388,11 +2444,12 @@ int av1_find_best_obmc_sub_pixel_tree_up(
bestmv->row *= 8;
bestmv->col *= 8;
- // use_accurate_subpel_search can be 0 or 1
+  // use_accurate_subpel_search can be 0, 1, or 2
if (use_accurate_subpel_search)
besterr = upsampled_setup_obmc_center_error(
xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
- y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
else
besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
z, y, y_stride, offset, mvjcost, mvcost,
@@ -2408,7 +2465,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
if (use_accurate_subpel_search) {
thismse = upsampled_obmc_pref_error(
xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
- pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
sp(tr), src_address, mask, &sse);
@@ -2439,7 +2497,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
if (use_accurate_subpel_search) {
thismse = upsampled_obmc_pref_error(
xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
- pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
src_address, mask, &sse);
@@ -2643,11 +2702,12 @@ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
return best_sad;
}
-int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second) {
+static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv,
+ int is_second) {
const int32_t *wsrc = x->wsrc_buf;
const int32_t *mask = x->mask_buf;
MV temp_mv;
@@ -2704,6 +2764,31 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
+int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+ int step_param, int sadpb, int further_steps,
+ int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ if (cpi->sf.obmc_full_pixel_search_level == 0) {
+ return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
+ further_steps, do_refine, fn_ptr, ref_mv,
+ dst_mv, is_second);
+ } else {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int search_range = 8;
+ *dst_mv = *mvp_full;
+ clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ int thissme = obmc_refining_search_sad(
+ x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ return thissme;
+ }
+}
+
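
The wrapper above selects between the two full-pixel strategies with a speed feature: level 0 keeps the original diamond search, while any other level runs the cheaper fixed-range refining search and then re-scores the winner. Summarized (the field and helper names are taken from the patch itself):

    // sf->obmc_full_pixel_search_level == 0 -> obmc_full_pixel_diamond()
    // otherwise -> obmc_refining_search_sad() over a +/-8 range, followed by
    //              get_obmc_mvpred_var() to convert the SAD winner to a
    //              variance-based score.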
// Note(yunqingwang): The following 2 functions are only used in the motion
// vector unit test; they return the extreme motion vectors allowed by the MV
// limits.
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
index 539e8f4e4..a975218b0 100644
--- a/third_party/aom/av1/encoder/mcomp.h
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_MCOMP_H_
-#define AV1_ENCODER_MCOMP_H_
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
#include "av1/encoder/block.h"
#include "aom_dsp/variance.h"
@@ -31,6 +31,11 @@ extern "C" {
// for Block_16x16
#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
// motion search site
typedef struct search_site {
MV mv;
@@ -43,6 +48,11 @@ typedef struct search_site_config {
int searches_per_step;
} search_site_config;
+typedef struct {
+ MV coord;
+ int coord_offset;
+} search_neighbors;
+
void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
void av1_init3smotion_compensation(search_site_config *cfg, int stride);
@@ -120,14 +130,15 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full, int step_param,
- int error_per_bit, int *cost_list, const MV *ref_mv,
- int var_max, int rd, int x_pos, int y_pos, int intra);
-
-int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second);
+ int method, int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
int av1_find_best_obmc_sub_pixel_tree_up(
MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
@@ -147,4 +158,4 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
} // extern "C"
#endif
-#endif // AV1_ENCODER_MCOMP_H_
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
index 3a27e5845..d21def43a 100644
--- a/third_party/aom/av1/encoder/ml.c
+++ b/third_party/aom/av1/encoder/ml.c
@@ -10,7 +10,9 @@
*/
#include <assert.h>
+#include <math.h>
+#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/ml.h"
void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
@@ -55,3 +57,17 @@ void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
weights += num_input_nodes;
}
}
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+  // The softmax function is invariant to adding the same constant to all
+  // input values, so we subtract the maximum input to avoid possible
+  // overflow in exp().
+ float max_inp = input[0];
+ for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ output[i] = (float)exp(input[i] - max_inp);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
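
A minimal call sketch for the new helper (hypothetical values; the max-subtraction inside av1_nn_softmax keeps exp() from overflowing even for large scores):

    float scores[3] = { 10.0f, 20.0f, 5.0f };
    float probs[3];
    av1_nn_softmax(scores, probs, 3);
    // probs[] now sums to 1.0f, with probs[1] the largest entry.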
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
index 614cb60bb..cb8ef2871 100644
--- a/third_party/aom/av1/encoder/ml.h
+++ b/third_party/aom/av1/encoder/ml.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ML_H_
-#define AV1_ENCODER_ML_H_
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
#ifdef __cplusplus
extern "C" {
@@ -37,8 +37,13 @@ typedef struct {
void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
float *output);
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_RD_H_
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index bbdd50784..8b88c4755 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PALETTE_H_
-#define AV1_ENCODER_PALETTE_H_
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
#include "av1/common/blockd.h"
@@ -93,4 +93,4 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
} // extern "C"
#endif
-#endif /* AV1_ENCODER_PALETTE_H_ */
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
index 279d39495..437ea43f9 100644
--- a/third_party/aom/av1/encoder/partition_model_weights.h
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
#ifdef __cplusplus
extern "C" {
@@ -1314,204 +1314,112 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
#define FEATURE_SIZE 18
#define LABEL_SIZE 4
-static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = {
- 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f,
- 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f,
- 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f,
- 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f,
- -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f,
- -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f,
- -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f,
- 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f,
- -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f,
- 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f,
- -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f,
- -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f,
- -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f,
- -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f,
- -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f,
- 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f,
- -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f,
- 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f,
- 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f,
- 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f,
- -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f,
- 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, -0.052993f,
- 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f,
- -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f,
- -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f,
- -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f,
- 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f,
- -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f,
- -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f,
- -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f,
- -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f,
- -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f,
- 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f,
- 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f,
- 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f,
- 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f,
- -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f,
- -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f,
- 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f,
- -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f,
- 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f,
- 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f,
- 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f,
- 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f,
- 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f,
- 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f,
- -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f,
- -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f,
- -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f,
- -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 0.011347f,
- 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f,
- -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f,
- 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f,
- -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f,
- -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f,
- -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f,
- -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f,
- -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f,
- 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f,
- 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f,
- 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f,
- 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f,
- -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f,
- -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f,
- -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f,
- 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f,
- -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f,
- -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f,
- -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f,
- 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f,
- 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f,
- 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f,
- 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f,
- -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f,
- 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f,
- 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f,
- -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f,
- -0.032010f, 0.110627f, 0.054094f, -0.884309f, -1.171623f, -0.386911f,
- -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f,
- -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f,
- 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f,
- -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f,
- 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f,
- 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f,
- -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f,
- 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f,
- 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f,
- -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f,
- -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f,
- 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f,
- 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f,
- -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f,
- -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f,
- 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f,
- -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f,
- 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f,
- -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f,
- 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f,
- -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f,
- -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f,
- 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f,
- 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f,
- 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f,
- -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f,
- -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f,
- -0.238839f, -0.071736f, 0.601437f, -0.664072f, 0.230827f, 0.198753f,
- -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f,
- -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f,
- 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f,
- -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f,
- -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f,
- 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f,
- -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f,
- -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f,
- -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f,
- -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f,
- 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f,
- 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f,
- -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f,
- 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f,
- -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f,
- 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f,
- -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f,
- 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f,
- 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f,
- -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f,
- -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f,
- 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f,
- -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f,
- 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f,
- 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f,
- -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f,
- -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f,
- -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f,
- 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f,
- -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f,
- -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f,
- -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f,
- 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f,
- -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f,
- -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f,
- 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f,
- -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f,
- 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f,
-};
-
-static const float av1_4_partition_nn_bias_16_layer0[48] = {
- -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f,
- -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f,
- -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f,
- -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f,
- -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f,
- 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f,
- 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f,
- -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f,
-};
-
-static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = {
- 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f,
- 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f,
- -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f,
- -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f,
- -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f,
- -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f,
- -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f,
- 0.185371f, 0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f,
- -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f,
- -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f,
- 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f,
- -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f,
- -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f,
- 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f,
- 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f,
- 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f,
- 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f,
- 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f,
- -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f,
- 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f,
- -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f,
- -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f,
- 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f,
- -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f,
- -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f,
- 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f,
- -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f,
- 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f,
- 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f,
- 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f,
- 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f,
- -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f,
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
};
static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
- -0.197883f,
- -0.136696f,
- 0.094115f,
- 0.612799f,
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
};
static const NN_CONFIG av1_4_partition_nnconfig_16 = {
@@ -1519,7 +1427,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
LABEL_SIZE, // num_outputs
1, // num_hidden_layers
{
- 48, // num_hidden_nodes
+ 24, // num_hidden_nodes
},
{
av1_4_partition_nn_weights_16_layer0,
@@ -1532,143 +1440,143 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
};
static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
- 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f,
- 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f,
- -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f,
- -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f,
- 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f,
- 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f,
- 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f,
- -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f,
- 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f,
- -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f,
- -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f,
- -0.004554f, 0.001784f, 0.035143f, -0.202237f, 0.080252f, -0.003912f,
- -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f,
- -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f,
- -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f,
- 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f,
- -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f,
- -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f,
- -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f,
- -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f,
- -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f,
- 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f,
- -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f,
- 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f,
- -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f,
- 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f,
- -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f,
- -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f,
- 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f,
- 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f,
- -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f,
- 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f,
- 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f,
- 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f,
- -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f,
- -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f,
- -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f,
- -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f,
- 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f,
- -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f,
- -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f,
- -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f,
- 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f,
- 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f,
- 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f,
- -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f,
- 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f,
- -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f,
- -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f,
- -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f,
- -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f,
- -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f,
- -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f,
- 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f,
- 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f,
- -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f,
- -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f,
- -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f,
- 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f,
- -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f,
- -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f,
- 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f,
- 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f,
- 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f,
- -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f,
- -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f,
- -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f,
- -0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f,
- -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f,
- -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f,
- -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f,
- 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f,
- -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f,
- -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f,
- 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f,
- -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f,
- 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f,
- 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f,
- 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f,
- -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f,
- 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f,
- -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f,
- 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f,
- -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f,
- -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f,
- -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f,
- 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f,
- -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f,
- 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f,
- -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f,
- 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f,
- 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f,
- -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f,
- -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f,
- -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f,
- 0.006962f, 0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f,
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
};
static const float av1_4_partition_nn_bias_32_layer0[32] = {
- 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f,
- 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f,
- -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f,
- -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f,
- -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f,
- 0.084333f, -0.056270f,
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
};
static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
- -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f,
- -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f,
- 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f,
- -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f,
- 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f,
- -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f,
- -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f,
- 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f,
- 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f,
- 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f,
- 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f,
- 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f,
- 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f,
- -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f,
- 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f,
- 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f,
- 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f,
- -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f,
- -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f,
- 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f,
- -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f,
- -0.295318f, 0.537378f,
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
};
static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
- -0.332518f,
- 0.114452f,
- 0.098949f,
- 0.465896f,
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
};
static const NN_CONFIG av1_4_partition_nnconfig_32 = {
@@ -1688,82 +1596,112 @@ static const NN_CONFIG av1_4_partition_nnconfig_32 = {
},
};
-static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
- 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f,
- -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f,
- 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f,
- -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f,
- -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f,
- -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f,
- 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f,
- 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f,
- -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f,
- 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f,
- -0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f,
- 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f,
- 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f,
- -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f,
- 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f,
- 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f,
- -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f,
- -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f,
- -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f,
- -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f,
- 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f,
- 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f,
- -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f,
- -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f,
- 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f,
- -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f,
- 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f,
- -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f,
- -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f,
- 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f,
- -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f,
- -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f,
- -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f,
- -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f,
- -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f,
- 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f,
- 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f,
- 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f,
- -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f,
- -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f,
- 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f,
- -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f,
- -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f,
- -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f,
- -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f,
- -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f,
- -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f,
- 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f,
-};
-
-static const float av1_4_partition_nn_bias_64_layer0[16] = {
- 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f,
- -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f,
- 0.235166f, 0.449468f, 0.294689f, -0.395300f,
-};
-
-static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = {
- -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f,
- 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f,
- -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f,
- -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f,
- 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f,
- 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f,
- -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f,
- -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f,
- 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f,
- -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f,
- -0.021259f, 0.194651f, -0.276294f, -0.109514f,
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
};
static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
- -0.688947f,
- 0.121075f,
- 0.289597f,
- 0.948091f,
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
};
static const NN_CONFIG av1_4_partition_nnconfig_64 = {
@@ -1771,7 +1709,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
LABEL_SIZE, // num_outputs
1, // num_hidden_layers
{
- 16, // num_hidden_nodes
+ 24, // num_hidden_nodes
},
{
av1_4_partition_nn_weights_64_layer0,
@@ -1786,8 +1724,725 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
#undef FEATURE_SIZE
#undef LABEL_SIZE
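The tables above are consumed by the encoder's generic fully-connected evaluator (av1_nn_predict in av1/encoder/ml.c). As a minimal sketch of that contract, assuming ReLU activations on the hidden layer and raw scores at the output, with a stand-in struct that mirrors the fields these NN_CONFIG initializers populate (the real struct lives in av1/encoder/ml.h):

    #define SKETCH_MAX_LAYERS 2
    #define SKETCH_MAX_NODES 128

    /* Stand-in mirroring the NN_CONFIG fields initialized in this file. */
    typedef struct {
      int num_inputs;
      int num_outputs;
      int num_hidden_layers;
      int num_hidden_nodes[SKETCH_MAX_LAYERS];
      const float *weights[SKETCH_MAX_LAYERS + 1];
      const float *bias[SKETCH_MAX_LAYERS + 1];
    } SketchNnConfig;

    /* Dense forward pass: each hidden layer applies weights and bias then
     * ReLU; the output layer emits raw scores, leaving any softmax or
     * thresholding to the caller. Weights are stored row-major per node,
     * matching the FEATURE_SIZE * 24 and 24 * LABEL_SIZE table shapes. */
    static void sketch_nn_predict(const float *features,
                                  const SketchNnConfig *cfg, float *output) {
      float buf[2][SKETCH_MAX_NODES];
      const float *in = features;
      int num_in = cfg->num_inputs;
      int which = 0;
      for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
        const int is_output = (layer == cfg->num_hidden_layers);
        const int num_out =
            is_output ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
        const float *w = cfg->weights[layer];
        float *out = is_output ? output : buf[which];
        for (int node = 0; node < num_out; ++node) {
          float val = cfg->bias[layer][node];
          for (int i = 0; i < num_in; ++i) val += w[node * num_in + i] * in[i];
          out[node] = (is_output || val > 0.0f) ? val : 0.0f; /* ReLU hidden */
        }
        in = out;
        num_in = num_out;
        which ^= 1;
      }
    }

For the av1_4_partition models above, output then holds LABEL_SIZE raw scores, one per four-way split candidate.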
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
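Unlike the four-way models, each partition-breakout net in this block has a single output. A hedged sketch of how such a score would gate further partition search, reusing sketch_nn_predict from above; the helper name and the zero threshold are illustrative assumptions, not the encoder's tuned decision rule:

    /* Hypothetical gate: a positive raw score suggests the current block is
     * already good enough to stop splitting. */
    static int sketch_partition_breakout(const float features[4],
                                         const SketchNnConfig *cfg) {
      float score; /* single raw output from the net */
      sketch_nn_predict(features, cfg, &score);
      return score > 0.0f; /* illustrative threshold only */
    }

The features[4] parameter matches FEATURE_SIZE, which is 4 for every breakout model in this block.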
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
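LABEL_SIZE is 3 for these rect-partition models, presumably one score per none/horizontal/vertical candidate (an assumption; the label order is fixed where the model is called). Raw scores become probabilities with a numerically stable softmax; a sketch:

    #include <math.h>

    /* Stable softmax over n raw scores; subtracting the max before
     * exponentiating keeps expf() from overflowing. */
    static void sketch_nn_softmax(const float *input, float *output, int n) {
      float max_v = input[0];
      for (int i = 1; i < n; ++i)
        if (input[i] > max_v) max_v = input[i];
      float sum = 0.0f;
      for (int i = 0; i < n; ++i) {
        output[i] = expf(input[i] - max_v);
        sum += output[i];
      }
      for (int i = 0; i < n; ++i) output[i] /= sum;
    }

Typical use after evaluating one of these configs: float prob[3]; sketch_nn_softmax(raw_scores, prob, 3); then prune the candidates whose probability falls below a chosen cutoff.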
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ 2.68750f,
+ -1.31894f,
+ -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ 2.47332f,
+ -1.65756f,
+ -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ 0.32215f,
+ -0.57522f,
+ 0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 461c3af83..c5508e25c 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -70,7 +70,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
// TODO(any): Enable multi-threading and remove this flag once the loop
// filter mask is compatible with multi-threaded filtering.
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane,
plane + 1, partial_frame);
#else
if (cpi->num_workers > 1)
@@ -193,6 +193,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
(void)sd;
lf->sharpness_level = 0;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
if (method == LPF_PICK_MINIMAL_LPF) {
lf->filter_level[0] = 0;
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
index 2a168358e..357097ae1 100644
--- a/third_party/aom/av1/encoder/picklpf.h
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PICKLPF_H_
-#define AV1_ENCODER_PICKLPF_H_
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
#ifdef __cplusplus
extern "C" {
@@ -27,4 +27,4 @@ void av1_pick_filter_level(const struct yv12_buffer_config *sd,
} // extern "C"
#endif
-#endif // AV1_ENCODER_PICKLPF_H_
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 28b693b08..e7804f6b4 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -15,6 +15,7 @@
#include <math.h>
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/binary_codes_writer.h"
@@ -22,7 +23,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
-
#include "av1/common/onyxc_int.h"
#include "av1/common/quant_common.h"
#include "av1/common/restoration.h"
@@ -181,6 +181,77 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
}
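+// C reference for av1_lowbd_pixel_proj_error(); the new config/av1_rtcd.h
+// include above suggests it is now dispatched through RTCD so SIMD versions
+// can be substituted. It returns the sum of squared errors between the
+// source and the self-guided projection of the degraded frame: with
+// u = dat << SGRPROJ_RST_BITS, each pixel is reconstructed as
+//   v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+// and rounded back down by (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) bits before
+// differencing with src. The four branches below specialize on which of the
+// two self-guided filters is enabled (params->r[0] / params->r[1]) so the
+// inner loops stay branch-free.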
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
int src_stride, const uint8_t *dat8,
int dat_stride, int use_highbitdepth,
@@ -192,21 +263,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
int xq[2];
decode_xq(xqd, xq, params);
if (!use_highbitdepth) {
- const uint8_t *src = src8;
- const uint8_t *dat = dat8;
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- const int32_t u =
- (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
- int32_t v = u << SGRPROJ_PRJ_BITS;
- if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u);
- if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u);
- const int32_t e =
- ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
- src[i * src_stride + j];
- err += e * e;
- }
- }
+ err = av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
} else {
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
@@ -463,9 +522,11 @@ static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
// Iterate over the stripe in blocks of width pu_width
for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j);
- av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j,
- flt1_row + j, flt_stride, sgr_params_idx,
- bit_depth, use_highbd);
+ const int ret = av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd);
+ (void)ret;
+ assert(!ret);
}
}
}
@@ -588,22 +649,9 @@ static void search_sgrproj(const RestorationTileLimits *limits,
if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
}
-static double find_average(const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int stride) {
- uint64_t sum = 0;
- double avg = 0;
- int i, j;
- aom_clear_system_state();
- for (i = v_start; i < v_end; i++)
- for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
- avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
- return avg;
-}
-
-static void compute_stats(int wiener_win, const uint8_t *dgd,
- const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int dgd_stride,
- int src_stride, double *M, double *H) {
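+// Accumulates Wiener filter statistics over the given window: M collects the
+// cross-correlation between each wiener_win x wiener_win neighborhood of the
+// degraded frame and the co-located source sample (both mean-removed), and H
+// the symmetric autocorrelation matrix of those neighborhoods. Renamed from
+// compute_stats with a _c suffix, presumably so SIMD variants can be selected
+// via the av1_compute_stats() dispatch used in search_wiener() below.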
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
int i, j, k, l;
double Y[WIENER_WIN2];
const int wiener_win2 = wiener_win * wiener_win;
@@ -626,8 +674,7 @@ static void compute_stats(int wiener_win, const uint8_t *dgd,
assert(idx == wiener_win2);
for (k = 0; k < wiener_win2; ++k) {
M[k] += Y[k] * X;
- H[k * wiener_win2 + k] += Y[k] * Y[k];
- for (l = k + 1; l < wiener_win2; ++l) {
+ for (l = k; l < wiener_win2; ++l) {
// H is a symmetric matrix, so we only need to fill out the upper
// triangle here. We can copy it down to the lower triangle outside
// the (i, j) loops.
@@ -1073,9 +1120,9 @@ static void search_wiener(const RestorationTileLimits *limits,
limits->h_start, limits->h_end, limits->v_start,
limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
} else {
- compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start,
- limits->h_end, limits->v_start, limits->v_end,
- rsc->dgd_stride, rsc->src_stride, M, H);
+ av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
}
const MACROBLOCK *const x = rsc->x;
@@ -1266,6 +1313,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
// problem, as these elements are ignored later, but in order to quiet
// Valgrind's warnings we initialise the array below.
memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
RestSearchCtxt rsc;
const int plane_start = AOM_PLANE_Y;
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
index 179b89ff9..3fec0c34b 100644
--- a/third_party/aom/av1/encoder/pickrst.h
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -8,22 +8,39 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PICKRST_H_
-#define AV1_ENCODER_PICKRST_H_
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
struct yv12_buffer_config;
struct AV1_COMP;
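+// Byte-shuffle pattern that expands nine consecutive samples into the eight
+// overlapping pairs (0,1), (1,2), ..., (7,8); presumably consumed by the
+// SIMD implementations of av1_compute_stats().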
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
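+// Returns the mean sample value over the window [h_start, h_end) x
+// [v_start, v_end). Hoisted out of pickrst.c and made INLINE, presumably so
+// the SIMD statistics code can share it; aom_clear_system_state() resets the
+// floating-point unit state before the double-precision division.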
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_PICKRST_H_
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
index 42a4c590b..40dd46768 100644
--- a/third_party/aom/av1/encoder/pustats.h
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PUSTATS_H_
-#define AV1_ENCODER_PUSTATS_H_
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
#ifdef __cplusplus
extern "C" {
@@ -18,83 +18,78 @@ extern "C" {
#include "av1/encoder/ml.h"
-#define NUM_FEATURES 11
+#define NUM_FEATURES_PUSTATS 8
#define NUM_HIDDEN_LAYERS 2
#define HIDDEN_LAYERS_0_NODES 12
#define HIDDEN_LAYERS_1_NODES 10
#define LOGITS_NODES 1
static const float
- av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
HIDDEN_LAYERS_0_NODES] = {
- 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f,
- -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f,
- -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f,
- 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f,
- -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f,
- -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f,
- 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f,
- -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f,
- -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f,
- 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f,
- 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f,
- -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f,
- 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f,
- -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f,
- -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f,
- -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f,
- -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f,
- 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f,
- -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f,
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
};
static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
{
- -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f,
- -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f,
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
};
static const float
av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f,
- -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f,
- -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f,
- 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f,
- -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f,
- -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f,
- 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f,
- 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f,
- 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f,
- -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f,
- 1.0898f, 0.0795f, -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f,
- -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f,
- -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f,
- 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f,
- -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f,
- -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f,
- -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f,
- -1.0829f,
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
};
static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
{
- -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f,
- 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f,
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
};
static const float
av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f,
- 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f,
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
};
static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
- 30.6878f,
+ 4.5103f,
};
static const NN_CONFIG av1_pustats_rate_nnconfig = {
- NUM_FEATURES, // num_inputs
+ NUM_FEATURES_PUSTATS, // num_inputs
LOGITS_NODES, // num_outputs
NUM_HIDDEN_LAYERS, // num_hidden_layers
{ HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
@@ -111,76 +106,71 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = {
};
static const float
- av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
HIDDEN_LAYERS_0_NODES] = {
- 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f,
- -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f,
- 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f,
- 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f,
- -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f,
- -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f,
- -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f,
- -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f,
- 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f,
- -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f,
- 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f,
- -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f,
- 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f,
- 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f,
- -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f,
- -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f,
- 0.0222f, -1.1055f, 0.1825f, 0.0575f, 0.0114f, -0.1259f, 0.3148f,
- -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f,
- -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f,
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
};
static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
{
- 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f,
- -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f,
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
};
static const float
av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f,
- -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f,
- -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f,
- 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f,
- 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f,
- -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f,
- -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, -0.0002f, -0.0595f,
- -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f,
- 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f,
- 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f,
- -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f,
- 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f,
- 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f,
- -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f,
- -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f,
- -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f,
- 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f,
- -0.5539f,
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
};
static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
{
- 11.9337f, -0.3681f, -6.1324f, 12.674f, 9.0956f,
- 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f,
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
};
static const float
av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f,
- -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f,
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
};
static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
- 4.6065f,
+ 2.3371f,
};
static const NN_CONFIG av1_pustats_dist_nnconfig = {
- NUM_FEATURES, // num_inputs
+ NUM_FEATURES_PUSTATS, // num_inputs
LOGITS_NODES, // num_outputs
NUM_HIDDEN_LAYERS, // num_hidden_layers
{ HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
@@ -196,7 +186,6 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
},
};
-#undef NUM_FEATURES
#undef NUM_HIDDEN_LAYERS
#undef HIDDEN_LAYERS_0_NODES
#undef HIDDEN_LAYERS_1_NODES
@@ -206,4 +195,4 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_PUSTATS_H_
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
index 9b2dac965..0bca39102 100644
--- a/third_party/aom/av1/encoder/random.h
+++ b/third_party/aom/av1/encoder/random.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RANDOM_H_
-#define AV1_ENCODER_RANDOM_H_
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
#ifdef __cplusplus
extern "C" {
@@ -26,4 +26,4 @@ static INLINE unsigned int lcg_rand16(unsigned int *state) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_RANDOM_H_
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
index 1019055ed..c429f2ce5 100644
--- a/third_party/aom/av1/encoder/ransac.h
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RANSAC_H_
-#define AV1_ENCODER_RANSAC_H_
+#ifndef AOM_AV1_ENCODER_RANSAC_H_
+#define AOM_AV1_ENCODER_RANSAC_H_
#include <stdio.h>
#include <stdlib.h>
@@ -32,4 +32,4 @@ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
int ransac_translation(int *matched_points, int npoints,
int *num_inliers_by_motion, double *params_by_motion,
int num_motions);
-#endif // AV1_ENCODER_RANSAC_H_
+#endif // AOM_AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
index 14d23f10f..7cd0962c5 100644
--- a/third_party/aom/av1/encoder/rate_distortion_model_params.h
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
-#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
#ifdef __cplusplus
extern "C" {
@@ -588,4 +588,4 @@ static const NN_CONFIG av1_rdcost_model_nnconfig = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index 3aae0144e..2597fb990 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -117,7 +117,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
for (i = 0; i < QINDEX_RANGE; i++) {
const double maxq = av1_convert_qindex_to_q(i, bit_depth);
kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
- kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
@@ -253,6 +253,9 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
interval += (interval & 0x01); // Round to even value
+#if CONFIG_FIX_GF_LENGTH
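+ // With fixed-length GF groups, the maximum interval must be able to hold
+ // at least one full group of FIXED_GF_LENGTH frames.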
+ interval = AOMMAX(FIXED_GF_LENGTH, interval);
+#endif
return AOMMAX(interval, min_gf_interval);
}
@@ -299,9 +302,9 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
- rc->rate_correction_factors[i] = 1.0;
+ rc->rate_correction_factors[i] = 0.7;
}
-
+ rc->rate_correction_factors[KF_STD] = 1.0;
rc->min_gf_interval = oxcf->min_gf_interval;
rc->max_gf_interval = oxcf->max_gf_interval;
if (rc->min_gf_interval == 0)
@@ -556,6 +559,14 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
arfgf_low_motion_minq, arfgf_high_motion_minq);
}
+#if REDUCE_LAST_ALT_BOOST
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+#endif
+
static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
const unsigned int curr_frame = cpi->common.current_video_frame;
@@ -918,7 +929,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
#define STATIC_MOTION_THRESH 95
static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
int height, int *bottom_index,
- int *top_index) {
+ int *top_index, int *arf_q) {
const AV1_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -959,7 +970,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
qindex = rc->last_boosted_qindex;
last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
- last_boosted_q * 0.75, bit_depth);
+ last_boosted_q * 0.5, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
}
} else {
@@ -1000,17 +1011,49 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// For constrained quality dont allow Q less than the cq level
if (oxcf->rc_mode == AOM_CQ) {
if (q < cq_level) q = cq_level;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+ (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
- active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-
- // Constrained quality use slightly lower active best.
- active_best_quality = active_best_quality * 15 / 16;
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+#if REDUCE_LAST_ALT_BOOST
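+ // Scale the ARF boost (the gap between the high-motion quality floor
+ // and the current active_best_quality) by arf_boost_factor, which, per
+ // the REDUCE_LAST_ALT_BOOST name, is presumably lowered for the final
+ // ALTREF of the sequence.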
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+ }
+#endif
+ *arf_q = active_best_quality;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ } else {
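+ // Internal ARFs start from the top-level ARF's q index and move halfway
+ // toward cq_level once per pyramid level below the top.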
+ active_best_quality = rc->arf_q;
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ ++this_height;
+ }
+ }
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
} else if (oxcf->rc_mode == AOM_Q) {
if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ *arf_q = active_best_quality;
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+ } else {
+ active_best_quality = rc->arf_q;
+ }
#if USE_SYMM_MULTI_LAYER
if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1030,6 +1073,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
}
} else {
active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
#if USE_SYMM_MULTI_LAYER
if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1104,7 +1153,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
} else {
- q = rc->last_boosted_qindex;
+ q = AOMMIN(rc->last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
}
} else {
q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
@@ -1129,7 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
return q;
}
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
int *bottom_index, int *top_index) {
int q;
if (cpi->oxcf.pass == 0) {
@@ -1140,8 +1190,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
top_index);
} else {
+ assert(cpi->oxcf.pass == 2 && "invalid encode pass");
+
+ GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int arf_q = 0;
+
q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
- top_index);
+ top_index, &arf_q);
+
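+ // Remember the q index chosen for the ARF; internal ARF frames in the
+ // pyramid derive their active_best_quality from rc->arf_q (see
+ // rc_pick_q_and_bounds_two_pass above).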
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ cpi->rc.arf_q = arf_q;
+ }
}
return q;
@@ -1327,13 +1386,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
update_golden_frame_stats(cpi);
if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
-
- // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
- // differently here for rc->avg_frame_bandwidth.
- if (cm->show_frame || rc->is_bwd_ref_frame) {
- rc->frames_since_key++;
- rc->frames_to_key--;
- }
// if (cm->current_video_frame == 1 && cm->show_frame)
/*
rc->this_frame_target =
@@ -1635,10 +1687,6 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
-#if FIX_GF_INTERVAL_LENGTH
- rc->max_gf_interval = FIXED_GF_LENGTH + 1;
-#endif
-
// Clamp min to max
rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index f0508da9e..198ecab97 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RATECTRL_H_
-#define AV1_ENCODER_RATECTRL_H_
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
@@ -25,13 +25,27 @@ extern "C" {
#define BPER_MB_NORMBITS 9
#define CUSTOMIZED_GF 1
-#define FIX_GF_INTERVAL_LENGTH 0
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
#define FIXED_GF_LENGTH 16
+#define MAX_PYRAMID_LVL 4
+// We allow a frame to have at most two left/right descendants before changing
+// them into a subtree, i.e., we allow the following structure:
+/* OUT_OF_ORDER_FRAME
+ / / \ \
+(two left children) F F F F (two right children) */
+// Therefore the max GF size supported by the 4-layer structure is
+// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on each side of their parent)
+#define MAX_PYRAMID_SIZE 24
#define USE_SYMM_MULTI_LAYER 1
+#define REDUCE_LAST_ALT_BOOST 1
+#define REDUCE_LAST_GF_LENGTH 1
+#define MULTI_LVL_BOOST_VBR_CQ 1
#else
#define USE_SYMM_MULTI_LAYER 0
+#define REDUCE_LAST_ALT_BOOST 0
+#define REDUCE_LAST_GF_LENGTH 0
+#define MULTI_LVL_BOOST_VBR_CQ 0
#endif
#if USE_SYMM_MULTI_LAYER
@@ -159,6 +173,9 @@ typedef struct {
// Auto frame-scaling variables.
int rf_level_maxq[RATE_FACTOR_LEVELS];
+ float_t arf_boost_factor;
+ // Q index used for the ALTREF frame
+ int arf_q;
} RATE_CONTROL;
struct AV1_COMP;
@@ -228,7 +245,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
int *frame_over_shoot_limit);
// Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
int *bottom_index, int *top_index);
// Estimates q to achieve a target bits per frame
@@ -275,4 +292,4 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_RATECTRL_H_
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
deleted file mode 100644
index e69de29bb..000000000
--- a/third_party/aom/av1/encoder/ratectrl_xiph.c
+++ /dev/null
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
deleted file mode 100644
index e69de29bb..000000000
--- a/third_party/aom/av1/encoder/ratectrl_xiph.h
+++ /dev/null
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index c4d4777bf..b87d89e50 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -648,6 +648,473 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
}
}
+static double interp_cubic(const double *p, double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
+
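
The two helpers above are a Catmull-Rom cubic: interp_cubic() evaluates the spline through four equally spaced samples p[0..3] at a fractional offset x in [0, 1], reproducing p[1] at x = 0 and p[2] at x = 1, and interp_bicubic() applies it separably, first along four rows and then across the four row results. A minimal standalone check of the endpoint property, using arbitrary illustrative sample values:

    #include <assert.h>

    static double interp_cubic(const double *p, double x) {
      return p[1] + 0.5 * x *
                        (p[2] - p[0] +
                         x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
                              x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
    }

    int main(void) {
      const double p[4] = { 1.0, 2.0, 4.0, 8.0 }; // arbitrary samples
      assert(interp_cubic(p, 0.0) == 2.0);        // x = 0 yields p[1]
      assert(interp_cubic(p, 1.0) == 4.0);        // x = 1 yields p[2]
      return 0;
    }
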
+static const double interp_rgrid_surf[65 * 18] = {
+ 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446,
+ 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868,
+ 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880,
+ 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510,
+ 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225,
+ 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788,
+ 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783,
+ 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772,
+ 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275,
+ 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783,
+ 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841,
+ 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553,
+ 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787,
+ 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620,
+ 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735,
+ 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803,
+ 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334,
+ 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042,
+ 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805,
+ 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548,
+ 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457,
+ 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652,
+ 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961,
+ 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872,
+ 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045,
+ 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441,
+ 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190,
+ 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155,
+ 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791,
+ 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533,
+ 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336,
+ 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676,
+ 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859,
+ 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706,
+ 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669,
+ 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020,
+ 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308,
+ 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916,
+ 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182,
+ 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200,
+ 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257,
+ 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588,
+ 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516,
+ 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371,
+ 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997,
+ 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568,
+ 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528,
+ 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722,
+ 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964,
+ 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861,
+ 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632,
+ 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610,
+ 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416,
+ 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242,
+ 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944,
+ 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953,
+ 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130,
+ 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193,
+ 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269,
+ 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202,
+ 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437,
+ 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262,
+ 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098,
+ 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168,
+ 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305,
+ 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670,
+ 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462,
+ 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513,
+ 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882,
+ 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674,
+ 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539,
+ 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945,
+ 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282,
+ 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382,
+ 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079,
+ 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488,
+ 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550,
+ 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356,
+ 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534,
+ 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069,
+ 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805,
+ 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967,
+ 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087,
+ 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573,
+ 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414,
+ 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739,
+ 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741,
+ 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017,
+ 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189,
+ 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798,
+ 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370,
+ 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759,
+ 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847,
+ 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480,
+ 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839,
+ 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578,
+ 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526,
+ 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624,
+ 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088,
+ 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577,
+ 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715,
+ 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139,
+ 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485,
+ 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402,
+ 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138,
+ 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162,
+ 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203,
+ 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325,
+ 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147,
+ 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989,
+ 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211,
+ 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463,
+ 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737,
+ 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946,
+ 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434,
+ 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854,
+ 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482,
+ 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083,
+ 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970,
+ 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111,
+ 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001,
+ 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644,
+ 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864,
+ 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288,
+ 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532,
+ 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477,
+ 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027,
+ 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256,
+ 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148,
+ 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306,
+ 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202,
+ 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658,
+ 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814,
+ 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523,
+ 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068,
+ 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792,
+ 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471,
+ 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800,
+ 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745,
+ 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078,
+ 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510,
+ 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309,
+ 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789,
+ 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909,
+ 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320,
+ 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800,
+ 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901,
+ 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257,
+ 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934,
+ 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525,
+ 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513,
+ 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092,
+ 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155,
+ 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143,
+ 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732,
+ 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504,
+ 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514,
+ 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339,
+ 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244,
+ 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557,
+ 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259,
+ 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445,
+ 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639,
+ 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093,
+ 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962,
+ 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512,
+ 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629,
+ 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900,
+ 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191,
+ 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
+ 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020,
+ 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859,
+ 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641,
+ 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020,
+ 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398,
+ 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663,
+ 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850,
+ 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648,
+ 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631,
+ 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626,
+ 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436,
+ 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546,
+ 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680,
+ 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524,
+ 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978,
+ 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349,
+ 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198,
+ 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460,
+ 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378,
+ 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056,
+};
+
+static const double interp_dgrid_surf[65 * 18] = {
+ 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
+ 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
+ 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149,
+ 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
+ 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
+ 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
+ 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
+ 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
+ 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
+ 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
+ 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
+ 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
+ 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449,
+ 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
+ 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
+ 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801,
+ 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
+ 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
+ 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
+ 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
+ 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393,
+ 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
+ 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
+ 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
+ 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
+ 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731,
+ 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
+ 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
+ 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
+ 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
+ 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550,
+ 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
+ 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
+ 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898,
+ 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
+ 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138,
+ 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
+ 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
+ 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141,
+ 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
+ 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993,
+ 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
+ 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
+ 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596,
+ 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
+ 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236,
+ 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
+ 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
+ 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293,
+ 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
+ 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798,
+ 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587,
+ 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862,
+ 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916,
+ 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
+ 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574,
+ 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027,
+ 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808,
+ 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911,
+ 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
+ 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988,
+ 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277,
+ 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442,
+ 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670,
+ 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080,
+ 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066,
+ 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182,
+ 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589,
+ 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333,
+ 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127,
+ 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090,
+ 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234,
+ 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686,
+ 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518,
+ 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724,
+ 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520,
+ 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288,
+ 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584,
+ 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363,
+ 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078,
+ 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923,
+ 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394,
+ 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302,
+ 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335,
+ 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988,
+ 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746,
+ 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545,
+ 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727,
+ 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965,
+ 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166,
+ 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420,
+ 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038,
+ 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871,
+ 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921,
+ 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323,
+ 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222,
+ 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619,
+ 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894,
+ 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731,
+ 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690,
+ 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167,
+ 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851,
+ 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310,
+ 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950,
+ 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993,
+ 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465,
+ 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429,
+ 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987,
+ 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098,
+ 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168,
+ 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452,
+ 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566,
+ 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535,
+ 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439,
+ 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361,
+ 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976,
+ 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135,
+ 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108,
+ 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039,
+ 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094,
+ 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449,
+ 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401,
+ 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422,
+ 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355,
+ 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905,
+ 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814,
+ 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762,
+ 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723,
+ 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839,
+ 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040,
+ 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690,
+ 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360,
+ 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692,
+ 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465,
+ 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683,
+ 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700,
+ 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815,
+ 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475,
+ 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690,
+ 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371,
+ 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221,
+ 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094,
+ 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475,
+ 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690,
+ 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280,
+ 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333,
+ 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458,
+ 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683,
+ 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762,
+ 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856,
+ 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232,
+ 0.001862,
+};
+
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *dist_f) {
+ const double x_start = -0.5;
+ const double x_end = 16.5;
+ const double x_step = 1;
+ const double y_start = -15.5;
+ const double y_end = 16.5;
+ const double y_step = 0.5;
+ const double epsilon = 1e-6;
+ const int stride = (int)rint((x_end - x_start) / x_step) + 1;
+ (void)y_end;
+
+ xm = AOMMAX(xm, x_start + x_step + epsilon);
+ xm = AOMMIN(xm, x_end - x_step - epsilon);
+ yl = AOMMAX(yl, y_start + y_step + epsilon);
+ yl = AOMMIN(yl, y_end - y_step - epsilon);
+
+ const double y = (yl - y_start) / y_step;
+ const double x = (xm - x_start) / x_step;
+
+ const int yi = (int)floor(y);
+ const int xi = (int)floor(x);
+ assert(xi > 0);
+ assert(yi > 0);
+
+ const double yo = y - yi;
+ const double xo = x - xi;
+ const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
+ const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
+ *rate_f = interp_bicubic(prate, stride, xo, yo);
+ *dist_f = interp_bicubic(pdist, stride, xo, yo);
+}
+
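
av1_model_rd_surffit() clamps its inputs one grid step inside the surface domain (x in [-0.5, 16.5] with step 1, y in [-15.5, 16.5] with step 0.5, hence the stride of 18) and then bicubically interpolates both 65x18 surfaces from the 4x4 neighbourhood anchored at (xi - 1, yi - 1). A usage sketch follows; the input values are illustrative, and the precise meaning of xm and yl is defined by callers that are not part of this hunk:

    double rate_f, dist_f;
    // Any point strictly inside the clamped domain is valid here.
    av1_model_rd_surffit(/*xm=*/8.0, /*yl=*/2.5, &rate_f, &dist_f);
    // rate_f and dist_f are the interpolated rate and distortion surface
    // values, which the caller scales back into its own units.
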
+static const double interp_rgrid_curv[65] = {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876,
+ 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983,
+ 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203,
+ 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495,
+ 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472,
+ 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660,
+ 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390,
+ 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028,
+ 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+};
+
+static const double interp_dgrid_curv[65] = {
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
+ 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
+ 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051,
+ 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427,
+ 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354,
+ 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000,
+};
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
+ const double x_start = -15.5;
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ (void)x_end;
+
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;
+ const int xi = (int)floor(x);
+ const double xo = x - xi;
+
+ assert(xi > 0);
+
+ const double *prate = &interp_rgrid_curv[(xi - 1)];
+ const double *pdist = &interp_dgrid_curv[(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ *distbysse_f = interp_cubic(pdist, xo);
+}
+
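
av1_model_rd_curvfit() is the one-dimensional counterpart: the same clamp-and-floor index math over a 65-entry curve with x_start = -15.5 and a step of 0.5. Working one illustrative input through it: xqr = 0.0 maps to x = (0.0 + 15.5) / 0.5 = 31.0, so xi = 31 and xo = 0.0, and the cubic is evaluated on interp_rgrid_curv[30..33] (plus the matching dgrid entries):

    double rate_f, distbysse_f;
    av1_model_rd_curvfit(0.0, &rate_f, &distbysse_f); // xi = 31, xo = 0.0
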
static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
const struct macroblockd_plane *pd,
ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
@@ -814,8 +1281,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_NEARESTG] = 0;
}
- rd->thresh_mult[THR_DC] += 1000;
-
rd->thresh_mult[THR_NEWMV] += 1000;
rd->thresh_mult[THR_NEWL2] += 1000;
rd->thresh_mult[THR_NEWL3] += 1000;
@@ -840,8 +1305,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_GLOBALG] += 2000;
rd->thresh_mult[THR_GLOBALA] += 2000;
- rd->thresh_mult[THR_PAETH] += 1000;
-
rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
@@ -956,15 +1419,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
- rd->thresh_mult[THR_H_PRED] += 2000;
- rd->thresh_mult[THR_V_PRED] += 2000;
- rd->thresh_mult[THR_D135_PRED] += 2500;
- rd->thresh_mult[THR_D203_PRED] += 2500;
- rd->thresh_mult[THR_D157_PRED] += 2500;
- rd->thresh_mult[THR_D67_PRED] += 2500;
- rd->thresh_mult[THR_D113_PRED] += 2500;
- rd->thresh_mult[THR_D45_PRED] += 2500;
-
rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
@@ -996,6 +1450,20 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+
+ rd->thresh_mult[THR_DC] += 1000;
+ rd->thresh_mult[THR_PAETH] += 1000;
+ rd->thresh_mult[THR_SMOOTH] += 2000;
+ rd->thresh_mult[THR_SMOOTH_V] += 2000;
+ rd->thresh_mult[THR_SMOOTH_H] += 2000;
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D203_PRED] += 2500;
+ rd->thresh_mult[THR_D157_PRED] += 2500;
+ rd->thresh_mult[THR_D67_PRED] += 2500;
+ rd->thresh_mult[THR_D113_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
}
void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 692367d7a..755b61df5 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RD_H_
-#define AV1_ENCODER_RD_H_
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
#include <limits.h>
@@ -57,8 +57,6 @@ typedef enum {
THR_NEARESTA,
THR_NEARESTG,
- THR_DC,
-
THR_NEWMV,
THR_NEWL2,
THR_NEWL3,
@@ -100,12 +98,6 @@ typedef enum {
THR_COMP_NEAREST_NEARESTLG,
THR_COMP_NEAREST_NEARESTBA,
- THR_PAETH,
-
- THR_SMOOTH,
- THR_SMOOTH_V,
- THR_SMOOTH_H,
-
THR_COMP_NEAR_NEARLA,
THR_COMP_NEW_NEARESTLA,
THR_COMP_NEAREST_NEWLA,
@@ -202,15 +194,6 @@ typedef enum {
THR_COMP_NEW_NEWGA2,
THR_COMP_GLOBAL_GLOBALGA2,
- THR_H_PRED,
- THR_V_PRED,
- THR_D135_PRED,
- THR_D203_PRED,
- THR_D157_PRED,
- THR_D67_PRED,
- THR_D113_PRED,
- THR_D45_PRED,
-
THR_COMP_NEAR_NEARLL2,
THR_COMP_NEW_NEARESTLL2,
THR_COMP_NEAREST_NEWLL2,
@@ -243,7 +226,26 @@ typedef enum {
THR_COMP_NEW_NEWBA,
THR_COMP_GLOBAL_GLOBALBA,
- MAX_MODES
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,
+
+ LAST_SINGLE_REF_MODES = THR_GLOBALG,
+ MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
+ LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
+ MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
} THR_MODES;
typedef enum {
@@ -392,6 +394,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
unsigned int qstep, int *rate, int64_t *dist);
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *distbysse_f);
+
int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
const MACROBLOCKD *xd);
@@ -455,4 +461,4 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
} // extern "C"
#endif
-#endif // AV1_ENCODER_RD_H_
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index fef6d2875..c2d15534f 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -55,17 +55,97 @@
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/tokenize.h"
#include "av1/encoder/tx_prune_model_weights.h"
-#define DNN_BASED_RD_INTERP_FILTER 0
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
-// Set this macro as 1 to collect data about tx size selection.
-#define COLLECT_TX_SIZE_DATA 0
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist);
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
-#if COLLECT_TX_SIZE_DATA
-static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
-#endif
+typedef enum {
+ MODELRD_LEGACY,
+ MODELRD_CURVFIT,
+ MODELRD_SUFFIT,
+ MODELRD_DNN,
+ MODELRD_FULLRDY,
+ MODELRD_TYPES
+} ModelRdType;
+
+static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
+ model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
+};
+
+static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
+ model_rd_with_dnn, NULL
+};
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_JNT_COMPOUND 1
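
The MODELRD_TYPE_* macros above select one entry of model_rd_sb_fn[] (and model_rd_sse_fn[]) per use case, so each call site dispatches through the table instead of hard-coding a model. A sketch of the intended call shape, assuming cpi, bsize, x, xd, mi_row and mi_col are in scope at the call site:

    int plane_rate[MAX_MB_PLANE], rate_sum, skip_txfm_sb;
    int64_t plane_sse[MAX_MB_PLANE], plane_dist[MAX_MB_PLANE];
    int64_t dist_sum, skip_sse_sb;
    // With MODELRD_TYPE_INTERP_FILTER defined as 1, this resolves to
    // model_rd_for_sb_with_curvfit().
    model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
        cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, mi_row, mi_col, &rate_sum,
        &dist_sum, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
        plane_dist);
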
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
@@ -103,6 +183,16 @@ typedef enum {
FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
} FAST_TX_SEARCH_MODE;
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd);
+
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode);
+
struct rdcost_block_args {
const AV1_COMP *cpi;
MACROBLOCK *x;
@@ -112,6 +202,7 @@ struct rdcost_block_args {
int64_t this_rd;
int64_t best_rd;
int exit_early;
+ int incomplete_exit;
int use_fast_coef_costing;
FAST_TX_SEARCH_MODE ftxs_mode;
};
@@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
{ NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
- { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEWMV, { LAST_FRAME, NONE_FRAME } },
{ NEWMV, { LAST2_FRAME, NONE_FRAME } },
{ NEWMV, { LAST3_FRAME, NONE_FRAME } },
@@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
{ NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
- { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
-
- { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
- { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
- { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
{ GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
- { H_PRED, { INTRA_FRAME, NONE_FRAME } },
- { V_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
{ NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
{ NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
@@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
{ NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
{ GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
};
static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
@@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
if (this_mode >= SINGLE_INTER_MODE_START &&
this_mode < SINGLE_INTER_MODE_END) {
assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
- assert(second_ref_frame == NONE_FRAME);
return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
[ref_frame];
}
@@ -479,6 +567,12 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
UV_D113_PRED, UV_D45_PRED,
};
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
typedef struct InterModeSearchState {
int64_t best_rd;
MB_MODE_INFO best_mbmode;
@@ -510,32 +604,21 @@ typedef struct InterModeSearchState {
int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
- int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES];
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+ // The RD cost of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
} InterModeSearchState;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-
-typedef struct InterModeRdModel {
- int ready;
- double a;
- double b;
- double dist_mean;
- int skip_count;
- int non_skip_count;
- int fp_skip_count;
- int bracket_idx;
-} InterModeRdModel;
-
-InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-
-#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
-static int inter_mode_data_idx[4];
-static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-
int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
if (bsize == BLOCK_8X8) return 1;
if (bsize == BLOCK_16X16) return 2;
@@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
return -1;
}
-void av1_inter_mode_data_init() {
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
- const int block_idx = inter_mode_data_block_idx(i);
- if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
- InterModeRdModel *md = &inter_mode_rd_models[i];
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
md->ready = 0;
- md->skip_count = 0;
- md->non_skip_count = 0;
- md->fp_skip_count = 0;
- md->bracket_idx = 0;
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
}
}
-void av1_inter_mode_data_show(const AV1_COMMON *cm) {
- printf("frame_offset %d\n", cm->frame_offset);
- for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
- const int block_idx = inter_mode_data_block_idx(i);
- if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
- InterModeRdModel *md = &inter_mode_rd_models[i];
- if (md->ready) {
- printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i,
- md->non_skip_count, md->skip_count, md->fp_skip_count);
+static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ aom_clear_system_state();
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ const double est_ld = md->a * sse + md->b;
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
+ *est_dist = (int64_t)round(md->dist_mean);
}
+ return 1;
}
+ return 0;
}
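
The estimate inverts the fitted line: training observes ld = (sse - dist) / residue_cost as roughly linear in sse, so with dist pinned at the learned dist_mean the residue cost falls out as (sse - dist_mean) / est_ld. One hand-worked instance with illustrative model parameters:

    /* Assume a fitted model with a = 0.01, b = 1.0, dist_mean = 500.
       For sse = 2000: est_ld = 0.01 * 2000 + 1.0 = 21.0, so
       est_residue_cost = (int)round((2000 - 500) / 21.0) = 71 and
       est_dist = 500. For sse < 500 the first branch instead returns
       est_residue_cost = 0 and est_dist = sse. */
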
-static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse,
- int curr_cost) {
- aom_clear_system_state();
- InterModeRdModel *md = &inter_mode_rd_models[bsize];
- if (md->ready) {
- const double est_ld = md->a * sse + md->b;
- const double est_residue_cost = (sse - md->dist_mean) / est_ld;
- const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost;
- const int64_t int64_dist_mean = (int64_t)round(md->dist_mean);
- const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean);
+static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
+ int64_t sse, int curr_cost) {
+ int est_residue_cost;
+ int64_t est_dist;
+ if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
+ int rate = est_residue_cost + curr_cost;
+ int64_t est_rd = RDCOST(rdmult, rate, est_dist);
return est_rd;
}
return 0;
}
-#define DATA_BRACKETS 7
-static const int data_num_threshold[DATA_BRACKETS] = {
- 200, 400, 800, 1600, 3200, 6400, INT32_MAX
-};
-
-void av1_inter_mode_data_fit(int rdmult) {
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
aom_clear_system_state();
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
const int block_idx = inter_mode_data_block_idx(bsize);
- InterModeRdModel *md = &inter_mode_rd_models[bsize];
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
if (block_idx == -1) continue;
- int data_num = inter_mode_data_idx[block_idx];
- if (data_num < data_num_threshold[md->bracket_idx]) {
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
continue;
+ } else {
+ if (md->ready == 0) {
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
}
- double my = 0;
- double mx = 0;
- double dx = 0;
- double dxy = 0;
- double dist_mean = 0;
- const int train_num = data_num;
- for (int i = 0; i < train_num; ++i) {
- const double sse = (double)inter_mode_data_sse[block_idx][i];
- const double dist = (double)inter_mode_data_dist[block_idx][i];
- const double residue_cost = inter_mode_data_residue_cost[block_idx][i];
- const double ld = (sse - dist) / residue_cost;
- dist_mean += dist;
- my += ld;
- mx += sse;
- dx += sse * sse;
- dxy += sse * ld;
- }
- dist_mean = dist_mean / data_num;
- my = my / train_num;
- mx = mx / train_num;
- dx = sqrt(dx / train_num);
- dxy = dxy / train_num;
-
- md->dist_mean = dist_mean;
- md->a = (dxy - mx * my) / (dx * dx - mx * mx);
- md->b = my - md->a * mx;
- ++md->bracket_idx;
- md->ready = 1;
- assert(md->bracket_idx < DATA_BRACKETS);
-
(void)rdmult;
-#if 0
- int skip_count = 0;
- int fp_skip_count = 0;
- double avg_error = 0;
- const int test_num = data_num;
- for (int i = 0; i < data_num; ++i) {
- const int64_t sse = inter_mode_data_sse[block_idx][i];
- const int64_t dist = inter_mode_data_dist[block_idx][i];
- const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i];
- const int64_t all_cost = inter_mode_data_all_cost[block_idx][i];
- const int64_t est_rd =
- get_est_rd(bsize, rdmult, sse, all_cost - residue_cost);
- const int64_t real_rd = RDCOST(rdmult, all_cost, dist);
- const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i];
- if (est_rd > ref_best_rd) {
- ++skip_count;
- if (real_rd < ref_best_rd) {
- ++fp_skip_count;
- }
- }
- avg_error += abs(est_rd - real_rd) * 100. / real_rd;
- }
- avg_error /= test_num;
- printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n",
- test_num, bsize, avg_error, skip_count, fp_skip_count);
-#endif
}
}
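
The fit itself is ordinary least squares on running moments: slope a = (E[xy] - E[x]E[y]) / (E[x^2] - E[x]^2) and intercept b = E[y] - a * E[x], with x = sse and y = ld (dx above is sqrt(sse_sse_mean), so dx * dx recovers E[x^2]); once a model is ready, fresh moments are blended with a 3:1 weight toward the previous estimate. A self-contained sketch of the same closed form on toy data:

    #include <stdio.h>

    /* Simple linear regression from running moments, the same closed form
       used in av1_inter_mode_data_fit(): y ~= a * x + b. */
    static void fit_line(double mx, double my, double mxx, double mxy,
                         double *a, double *b) {
      *a = (mxy - mx * my) / (mxx - mx * mx);
      *b = my - *a * mx;
    }

    int main(void) {
      /* Toy points lying exactly on y = 2x + 1 (illustrative only). */
      const double x[3] = { 1, 2, 3 }, y[3] = { 3, 5, 7 };
      double mx = 0, my = 0, mxx = 0, mxy = 0;
      for (int i = 0; i < 3; ++i) {
        mx += x[i] / 3;
        my += y[i] / 3;
        mxx += x[i] * x[i] / 3;
        mxy += x[i] * y[i] / 3;
      }
      double a, b;
      fit_line(mx, my, mxx, mxy, &a, &b);
      printf("a = %f, b = %f\n", a, b); /* a = 2, b = 1 */
      return 0;
    }
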
-static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist,
- int residue_cost, int all_cost,
- int64_t ref_best_rd) {
+static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int64_t dist, int residue_cost) {
if (residue_cost == 0 || sse == dist) return;
const int block_idx = inter_mode_data_block_idx(bsize);
if (block_idx == -1) return;
- if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) {
- const int data_idx = inter_mode_data_idx[block_idx];
- inter_mode_data_sse[block_idx][data_idx] = sse;
- inter_mode_data_dist[block_idx][data_idx] = dist;
- inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost;
- inter_mode_data_all_cost[block_idx][data_idx] = all_cost;
- inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd;
- ++inter_mode_data_idx[block_idx];
+ InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+ if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+ aom_clear_system_state();
+ const double ld = (sse - dist) * 1. / residue_cost;
+ ++rd_model->num;
+ rd_model->dist_sum += dist;
+ rd_model->ld_sum += ld;
+ rd_model->sse_sum += sse;
+ rd_model->sse_sse_sum += sse * sse;
+ rd_model->sse_ld_sum += sse * ld;
+ }
+}
+
+static void inter_modes_info_push(InterModesInfo *inter_modes_info,
+ int mode_rate, int64_t sse, int64_t est_rd,
+ const MB_MODE_INFO *mbmi) {
+ const int num = inter_modes_info->num;
+ assert(num < MAX_INTER_MODES);
+ inter_modes_info->mbmi_arr[num] = *mbmi;
+ inter_modes_info->mode_rate_arr[num] = mode_rate;
+ inter_modes_info->sse_arr[num] = sse;
+ inter_modes_info->est_rd_arr[num] = est_rd;
+ ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {
+ if (((const RdIdxPair *)a)->rd == ((const RdIdxPair *)b)->rd) {
+ return 0;
+ } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) {
+ return 1;
+ } else {
+ return -1;
+ }
+}
+
+static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
+ RdIdxPair *rd_idx_pair_arr) {
+ if (inter_modes_info->num == 0) {
+ return;
+ }
+ for (int i = 0; i < inter_modes_info->num; ++i) {
+ rd_idx_pair_arr[i].idx = i;
+ rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
}
+ qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+ compare_rd_idx_pair);
}
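
Sorting an (rd, idx) pair array rather than the records themselves keeps the large MB_MODE_INFO entries in place; callers then visit candidates in increasing estimated RD through the index. A traversal sketch, assuming inter_modes_info is in scope and rd_idx_pair_arr holds at least inter_modes_info->num entries:

    RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
    inter_modes_info_sort(inter_modes_info, rd_idx_pair_arr);
    for (int j = 0; j < inter_modes_info->num; ++j) {
      const int idx = rd_idx_pair_arr[j].idx;
      const MB_MODE_INFO *mbmi = &inter_modes_info->mbmi_arr[idx];
      // ... run the full RD search on mbmi, cheapest-looking candidates first
    }
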
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
@@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) {
// will lead to pruning i+1 TX types on average
static const float *prune_2D_adaptive_thresholds[] = {
// TX_4X4
- (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f,
- 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f,
- 0.08606f, 0.09827f },
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
// TX_8X8
- (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f,
- 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f,
- 0.09363f, 0.11682f },
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
// TX_16X16
(float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
0.06897f, 0.07629f, 0.08875f, 0.11169f },
@@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = {
// TX_64X64
NULL,
// TX_4X8
- (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f,
- 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
- 0.09119f, 0.10828f },
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
// TX_8X4
- (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f,
- 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
- 0.09167f, 0.10974f },
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
// TX_8X16
- (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f,
- 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f,
- 0.09509f, 0.12097f },
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
// TX_16X8
- (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f,
- 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f,
- 0.09485f, 0.12048f },
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
// TX_16X32
- (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f,
- 0.06506f, 0.07385f, 0.08606f, 0.10925f },
+ NULL,
// TX_32X16
- (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f,
- 0.06531f, 0.07336f, 0.08582f, 0.11072f },
+ NULL,
// TX_32X64
NULL,
// TX_64X32
NULL,
// TX_4X16
- NULL,
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
// TX_16X4
- NULL,
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
// TX_8X32
NULL,
// TX_32X8
@@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
cur_scores_2D[3];
}
score_2D_average /= 16;
- score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+
+ const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
+ int pruning_aggressiveness = 1;
+ if (tx_set_type == EXT_TX_SET_ALL16) {
+ score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+ } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
+ score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+ }
// Always keep the TX type with the highest score, prune all others with
// score below score_thresh.
@@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
}
}
- int pruning_aggressiveness = 0;
- if (prune_mode == PRUNE_2D_ACCURATE) {
- if (tx_set_type == EXT_TX_SET_ALL16)
- pruning_aggressiveness = 6;
- else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
- pruning_aggressiveness = 4;
- } else if (prune_mode == PRUNE_2D_FAST) {
- if (tx_set_type == EXT_TX_SET_ALL16)
- pruning_aggressiveness = 10;
- else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
- pruning_aggressiveness = 7;
- }
const float score_thresh =
prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
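For orientation, the pruning that consumes these thresholds reduces to: score every candidate TX type, always keep the highest-scoring one, and set a prune bit for any other type whose score falls below prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]. A minimal standalone sketch (names are illustrative, not the library's):

    #include <stdint.h>

    // Sketch of threshold-based TX-type pruning: bit i of the result is
    // set when TX type i should be pruned; the best-scoring type is
    // always kept regardless of the threshold.
    static uint16_t prune_by_score(const float *scores, int n_tx_types,
                                   float score_thresh) {
      int best = 0;
      for (int i = 1; i < n_tx_types; ++i)
        if (scores[i] > scores[best]) best = i;
      uint16_t prune_mask = 0;
      for (int i = 0; i < n_tx_types; ++i)
        if (i != best && scores[i] < score_thresh)
          prune_mask |= (uint16_t)(1 << i);
      return prune_mask;
    }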
@@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
}
static void model_rd_from_sse(const AV1_COMP *const cpi,
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
- int plane, int64_t sse, int *rate,
- int64_t *dist) {
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int dequant_shift =
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
@@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
  // Fast approximation of the modelling function.
if (cpi->sf.simple_model_rd_from_var) {
const int64_t square_error = sse;
- int quantizer = (pd->dequant_Q3[1] >> dequant_shift);
+ int quantizer = pd->dequant_Q3[1] >> dequant_shift;
if (quantizer < 120)
- *rate = (int)((square_error * (280 - quantizer)) >>
- (16 - AV1_PROB_COST_SHIFT));
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
else
*rate = 0;
+ assert(*rate >= 0);
*dist = (square_error * quantizer) >> 8;
} else {
- av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
pd->dequant_Q3[1] >> dequant_shift, rate,
dist);
}
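As a worked example of the fast path above (assuming AV1_PROB_COST_SHIFT is 9, so the shift is 7): with sse = 4096 and quantizer = 40, the rate estimate is (4096 * 240) >> 7 = 7680, now clamped to INT_MAX on overflow, and the distortion estimate is (4096 * 40) >> 8 = 640. A self-contained sketch of the same arithmetic:

    #include <inttypes.h>
    #include <limits.h>
    #include <stdio.h>

    #define PROB_COST_SHIFT 9  // assumed value of AV1_PROB_COST_SHIFT

    // Sketch of the simple_model_rd_from_var fast path, including the
    // overflow clamp added by this change.
    static void fast_model_rd(int64_t sse, int quantizer, int *rate,
                              int64_t *dist) {
      if (quantizer < 120) {
        const int64_t r = (sse * (280 - quantizer)) >> (16 - PROB_COST_SHIFT);
        *rate = (int)(r < INT_MAX ? r : INT_MAX);
      } else {
        *rate = 0;
      }
      *dist = (sse * quantizer) >> 8;
    }

    int main(void) {
      int rate;
      int64_t dist;
      fast_model_rd(4096, 40, &rate, &dist);
      printf("rate=%d dist=%" PRId64 "\n", rate, dist);  // rate=7680 dist=640
      return 0;
    }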
@@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
- int plane_to, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int *plane_rate,
- int64_t *plane_sse, int64_t *plane_dist) {
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
int plane;
+ (void)mi_row;
+ (void)mi_col;
const int ref = xd->mi[0]->ref_frame[0];
int64_t rate_sum = 0;
int64_t dist_sum = 0;
int64_t total_sse = 0;
- x->pred_sse[ref] = 0;
-
for (plane = plane_from; plane <= plane_to; ++plane) {
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
if (x->skip_chroma_rd && plane) continue;
- // TODO(geza): Write direct sse functions that do not compute
- // variance as well.
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+ model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
total_sse += sse;
-
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist);
-
rate_sum += rate;
dist_sum += dist;
if (plane_rate) plane_rate[plane] = rate;
if (plane_sse) plane_sse[plane] = sse;
if (plane_dist) plane_dist[plane] = dist;
+ assert(rate_sum >= 0);
}
if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ rate_sum = AOMMIN(rate_sum, INT_MAX);
*out_rate_sum = (int)rate_sum;
*out_dist_sum = dist_sum;
}
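The hunk above replaces the residual-based aom_sum_squares_2d_i16() with a direct source-vs-prediction SSE. A scalar reference for what aom_sse() computes (the real implementation is SIMD-optimized; this sketch is for illustration only):

    #include <stdint.h>

    // Scalar reference: sum of squared differences between a source
    // block and its prediction, walking the two buffers by stride.
    static int64_t sse_ref(const uint8_t *src, int src_stride,
                           const uint8_t *dst, int dst_stride, int w, int h) {
      int64_t sse = 0;
      for (int r = 0; r < h; ++r) {
        for (int c = 0; c < w; ++c) {
          const int d = src[c] - dst[c];
          sse += (int64_t)d * d;
        }
        src += src_stride;
        dst += dst_stride;
      }
      return sse;
    }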
@@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
assert(visible_cols > 0);
#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+ if (x->using_dist_8x8 && plane == 0)
return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
tx_bsize, txb_cols, txb_rows, visible_cols,
visible_rows, x->qindex);
@@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
int blk_row, int blk_col,
const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize,
- int force_sse) {
+ const BLOCK_SIZE tx_bsize) {
int visible_rows, visible_cols;
const MACROBLOCKD *xd = &x->e_mbd;
get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
#if CONFIG_DIST_8X8
int txb_height = block_size_high[tx_bsize];
int txb_width = block_size_wide[tx_bsize];
- if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
- txb_height >= 8) {
+ if (x->using_dist_8x8 && plane == 0) {
const int src_stride = x->plane[plane].src.stride;
const int src_idx = (blk_row * src_stride + blk_col)
<< tx_size_wide_log2[0];
@@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
MAX_TX_SIZE, eob,
cpi->common.reduced_tx_set_used);
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
- // Save decoded pixels for inter block in pd->pred to avoid
- // block_8x8_rd_txfm_daala_dist() need to produce them
- // by calling av1_inverse_transform_block() again.
- const int pred_stride = block_size_wide[plane_bsize];
- const int pred_idx = (blk_row * pred_stride + blk_col)
- << tx_size_wide_log2[0];
- int16_t *pred = &x->pred_luma[pred_idx];
- int i, j;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (j = 0; j < bsh; j++)
- for (i = 0; i < bsw; i++)
- pred[j * pred_stride + i] =
- CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
- } else {
- for (j = 0; j < bsh; j++)
- for (i = 0; i < bsw; i++)
- pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
- }
- }
-#endif // CONFIG_DIST_8X8
return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
blk_row, blk_col, plane_bsize, tx_bsize);
}
@@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads(
}
}
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
#if CONFIG_COLLECT_RD_STATS
- // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
- // 0: Do not collect any RD stats
- // 1: Collect RD stats for transform units
- // 2: Collect RD stats for partition units
#if CONFIG_COLLECT_RD_STATS == 1
static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
  // Generate a small sample to restrict output size.
static unsigned int seed = 21743;
- if (lcg_rand16(&seed) % 100 > 0) return;
+ if (lcg_rand16(&seed) % 256 > 0) return;
const char output_file[] = "tu_stats.txt";
FILE *fout = fopen(output_file, "a");
@@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist);
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
const double model_rate_norm = (double)model_rate / num_samples;
const double model_dist_norm = (double)model_dist / num_samples;
fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
@@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
}
#endif // CONFIG_COLLECT_RD_STATS == 1
-#if CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_RD_STATS >= 2
static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const RD_STATS *const rd_stats,
BLOCK_SIZE plane_bsize) {
@@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
  // Generate a small sample to restrict output size.
static unsigned int seed = 95014;
- if (lcg_rand16(&seed) % 100 > 0) return;
+ if (lcg_rand16(&seed) % 256 > 0) return;
const char output_file[] = "pu_stats.txt";
FILE *fout = fopen(output_file, "a");
@@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const double rate_norm = (double)rd_stats->rate / num_samples;
const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
- fprintf(fout, "%g %g", rate_norm, dist_norm);
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
const int src_stride = p->src.stride;
const uint8_t *const src = p->src.buf;
@@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, " %g", sad_norm_arr[i]);
}
- fprintf(fout, " %d %d %d", q_step, bw, bh);
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist);
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
const double model_rate_norm = (double)model_rate / num_samples;
const double model_dist_norm = (double)model_dist / num_samples;
- fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
double mean = get_mean(src_diff, diff_stride, bw, bh);
mean /= (1 << shift);
@@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, "\n");
fclose(fout);
}
-#endif // CONFIG_COLLECT_RD_STATS == 2
+#endif // CONFIG_COLLECT_RD_STATS >= 2
#endif // CONFIG_COLLECT_RD_STATS
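Both stats printers now emit an RD-cost column next to rate and distortion, each normalized per sample. A sketch of that normalization (rd_cost is assumed to be the output of the RDCOST macro, whose exact rate/distortion scaling is elided here):

    #include <stdint.h>
    #include <stdio.h>

    // Per-sample normalization used by the stats printers: rate,
    // distortion, and their combined RD cost are each divided by the
    // sample count so blocks of different sizes are comparable.
    static void print_normalized(FILE *fout, int rate, int64_t dist,
                                 int64_t rd_cost, int num_samples) {
      const double rate_norm = (double)rate / num_samples;
      const double dist_norm = (double)dist / num_samples;
      const double rdcost_norm = (double)rd_cost / num_samples;
      fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
    }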
-static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE plane_bsize, int plane, int64_t *rsse,
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
int *rate, int64_t *dist) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int log_numpels = num_pels_log2_lookup[plane_bsize];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
const struct macroblock_plane *const p = &x->plane[plane];
int bw, bh;
- const int diff_stride = block_size_wide[plane_bsize];
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
&bh);
- const int num_samples = bw * bh;
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
- const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-
const int src_stride = p->src.stride;
const uint8_t *const src = p->src.buf;
const int dst_stride = pd->dst.stride;
const uint8_t *const dst = pd->dst.buf;
const int16_t *const src_diff = p->src_diff;
+ const int diff_stride = block_size_wide[plane_bsize];
const int shift = (xd->bd - 8);
- int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh);
- sse = ROUND_POWER_OF_TWO(sse, shift * 2);
- const double sse_norm = (double)sse / num_samples;
if (sse == 0) {
if (rate) *rate = 0;
if (dist) *dist = 0;
- if (rsse) *rsse = sse;
return;
}
if (plane) {
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate,
- &model_dist);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
if (rate) *rate = model_rate;
if (dist) *dist = model_dist;
- if (rsse) *rsse = sse;
return;
}
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+
double sse_norm_arr[4];
get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
dst_stride, src_diff, diff_stride,
@@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
mean /= (1 << shift);
}
- const double variance = sse_norm - mean * mean;
- assert(variance >= 0.0);
+ double sse_norm_sum = 0.0, sse_frac_arr[3];
+ for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
+ for (int k = 0; k < 3; ++k)
+ sse_frac_arr[k] =
+ sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
const double q_sqr = (double)(q_step * q_step);
const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
+ const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
double hor_corr, vert_corr;
get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
- float features[11];
+ float features[NUM_FEATURES_PUSTATS];
features[0] = (float)hor_corr;
features[1] = (float)log_numpels;
- features[2] = (float)q_sqr;
+ features[2] = (float)mean_sqr_by_sse_norm;
features[3] = (float)q_sqr_by_sse_norm;
- features[4] = (float)sse_norm_arr[0];
- features[5] = (float)sse_norm_arr[1];
- features[6] = (float)sse_norm_arr[2];
- features[7] = (float)sse_norm_arr[3];
- features[8] = (float)sse_norm;
- features[9] = (float)variance;
- features[10] = (float)vert_corr;
+ features[4] = (float)sse_frac_arr[0];
+ features[5] = (float)sse_frac_arr[1];
+ features[6] = (float)sse_frac_arr[2];
+ features[7] = (float)vert_corr;
float rate_f, dist_by_sse_norm_f;
av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
@@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
// Check if skip is better
- if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) {
+ if (rate_i == 0) {
dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
rate_i = 0;
- } else if (rate_i == 0) {
dist_i = sse << 4;
}
if (rate) *rate = rate_i;
if (dist) *dist = dist_i;
- if (rsse) *rsse = sse;
return;
}
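The reworked feature vector above has eight entries (so NUM_FEATURES_PUSTATS is presumably 8): horizontal correlation, log2 of the pixel count, mean^2/sse_norm, qstep^2/sse_norm, three quadrant-SSE fractions, and vertical correlation. A sketch of the quadrant-fraction computation, including the uniform fallback when the block has no error:

    // Sketch of the SSE-fraction features: the four quadrant SSEs reduce
    // to three fractions of their sum (the fourth is implied), with a
    // uniform 0.25 fallback when the sum is zero.
    static void sse_fractions(const double sse_norm_arr[4],
                              double sse_frac_arr[3]) {
      double sum = 0.0;
      for (int k = 0; k < 4; ++k) sum += sse_norm_arr[k];
      for (int k = 0; k < 3; ++k)
        sse_frac_arr[k] = sum > 0.0 ? sse_norm_arr[k] / sum : 0.25;
    }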
-void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
- int plane_to, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int *plane_rate,
- int64_t *plane_sse, int64_t *plane_dist) {
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -2562,19 +2655,306 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
int64_t dist_sum = 0;
int64_t total_sse = 0;
- x->pred_sse[ref] = 0;
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a surface for rate and distortion using as features:
+// log2(sse_norm + 1) and log2(sse_norm/qstep^2)
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xm = log(sse_norm + 1.0) / log(2.0);
+ const double yl = log(sse_norm / qstepsqr) / log(2.0);
+ double rate_f, dist_by_sse_norm_f;
+
+ av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
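To make the surface-fit features concrete, a worked example (values chosen for illustration, not from the source): with sse_norm = 256 and qstep = 16, qstepsqr = 256, so xm = log2(257) ≈ 8.006 and yl = log2(256 / 256) = 0; av1_model_rd_surffit then evaluates the fitted rate and distortion surfaces at that point.

    #include <math.h>
    #include <stdio.h>

    // Worked example of the surface-fit features used above.
    int main(void) {
      const double sse_norm = 256.0, qstep = 16.0;
      const double qstepsqr = qstep * qstep;
      const double xm = log(sse_norm + 1.0) / log(2.0);
      const double yl = log(sse_norm / qstepsqr) / log(2.0);
      printf("xm = %.3f, yl = %.3f\n", xm, yl);  // xm = 8.006, yl = 0.000
      return 0;
    }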
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a curve for rate and distortion using as feature:
+// log2(sse_norm/qstep^2)
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+
+ double rate_f, dist_by_sse_norm_f;
+ av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
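model_rd_with_dnn, model_rd_with_surffit, and model_rd_with_curvfit all end with the same skip override, sketched in isolation below. rdcost() is an illustrative stand-in for the RDCOST macro; the scaling constants (rate shift 9, distortion shift 7) are assumptions, so treat them as approximate.

    #include <stdint.h>

    // Illustrative stand-in for RDCOST: rate scaled by the Lagrange
    // multiplier plus scaled distortion (constants assumed).
    static int64_t rdcost(int rdmult, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult + (1 << 8)) >> 9) + (dist << 7);
    }

    // Shared skip override: if the model predicts zero rate, or coding
    // is no cheaper than skipping, report skip (distortion = sse << 4,
    // matching the 16x distortion scale used throughout).
    static void maybe_prefer_skip(int rdmult, int64_t sse, int *rate_i,
                                  int64_t *dist_i) {
      if (*rate_i == 0) {
        *dist_i = sse << 4;
      } else if (rdcost(rdmult, *rate_i, *dist_i) >=
                 rdcost(rdmult, 0, sse << 4)) {
        *rate_i = 0;
        *dist_i = sse << 4;
      }
    }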
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
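These per-superblock variants are selected through function-pointer tables rather than called directly; the diff already uses model_rd_sse_fn[MODELRD_CURVFIT] and model_rd_sb_fn[MODELRD_TYPE_INTRA]. A reduced sketch of that dispatch (signatures simplified; enumerator names beyond MODELRD_CURVFIT are assumptions for illustration):

    #include <stdint.h>

    typedef void (*model_rd_fn)(int64_t sse, int num_samples, int *rate,
                                int64_t *dist);

    enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES };

    static void model_stub(int64_t sse, int n, int *rate, int64_t *dist) {
      (void)n;           // stub body; the real table entries are the
      *rate = 0;         // model_rd_* functions defined in this file
      *dist = sse << 4;
    }

    static const model_rd_fn model_rd_sse_fn_sketch[MODELRD_TYPES] = {
      model_stub,  // MODELRD_LEGACY  -> e.g. model_rd_from_sse
      model_stub,  // MODELRD_CURVFIT -> e.g. model_rd_with_curvfit
    };

    int main(void) {
      int rate;
      int64_t dist;
      model_rd_sse_fn_sketch[MODELRD_CURVFIT](1024, 64, &rate, &dist);
      return rate;  // exercise the table
    }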
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const BLOCK_SIZE plane_bsize =
get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
int64_t sse;
int rate;
int64_t dist;
if (x->skip_chroma_rd && plane) continue;
- model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+ RD_STATS rd_stats;
+ if (plane == 0) {
+ select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+ if (rd_stats.invalid_rate) {
+ rate = 0;
+ dist = sse << 4;
+ } else {
+ rate = rd_stats.rate;
+ dist = rd_stats.dist;
+ }
+ } else {
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+ }
if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
@@ -2638,8 +3018,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
- if (intra_hash_idx > 0 &&
- intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+ if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
const TX_TYPE ref_tx_type =
@@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_DIST_8X8
if (x->using_dist_8x8) use_transform_domain_distortion = 0;
#endif
-
int calc_pixel_domain_distortion_final =
cpi->sf.use_transform_domain_distortion == 1 &&
use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
@@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
int64_t block_sse =
- pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
+ pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_sse *= 16;
@@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
assert(best_rd != INT64_MAX);
best_rd_stats->skip = best_eob == 0;
- if (best_eob == 0) best_tx_type = DCT_DCT;
if (plane == 0) {
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
best_tx_type);
@@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
int64_t rd1, rd2, rd;
RD_STATS this_rd_stats;
-#if CONFIG_DIST_8X8
- // If sub8x8 tx, 8x8 or larger partition, and luma channel,
- // dist-8x8 disables early skip, because the distortion metrics for
- // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition
- // (new distortion metric) are different.
- // Exception is: dist-8x8 is enabled but still MSE is used,
- // i.e. "--tune=" encoder option is not used.
- int bw = block_size_wide[plane_bsize];
- int bh = block_size_high[plane_bsize];
- int disable_early_skip =
- x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 &&
- (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
- x->tune_metric != AOM_TUNE_PSNR;
-#endif // CONFIG_DIST_8X8
-
av1_init_rd_stats(&this_rd_stats);
- if (args->exit_early) return;
+ if (args->exit_early) {
+ args->incomplete_exit = 1;
+ return;
+ }
if (!is_inter_block(mbmi)) {
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
@@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
#endif // CONFIG_RD_DEBUG
av1_set_txb_context(x, plane, block, tx_size, a, l);
- if (plane == 0) {
- x->blk_skip[blk_row *
- (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
- blk_col] = (x->plane[plane].eobs[block] == 0);
- }
+ const int blk_idx =
+ blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
+ blk_col;
+
+ if (plane == 0)
+ set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+ else
+ set_blk_skip(x, plane, blk_idx, 0);
rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
@@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
args->this_rd += rd;
-#if CONFIG_DIST_8X8
- if (!disable_early_skip)
-#endif
- if (args->this_rd > args->best_rd) {
- args->exit_early = 1;
- return;
- }
-}
-
-#if CONFIG_DIST_8X8
-static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- struct rdcost_block_args *args) {
- MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblockd_plane *const pd = &xd->plane[0];
- const struct macroblock_plane *const p = &x->plane[0];
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
- const uint8_t *src = &p->src.buf[0];
- const uint8_t *dst = &pd->dst.buf[0];
- const int16_t *pred = &x->pred_luma[0];
- int bw = block_size_wide[bsize];
- int bh = block_size_high[bsize];
- int visible_w = bw;
- int visible_h = bh;
-
- int i, j;
- int64_t rd, rd1, rd2;
- int64_t sse = INT64_MAX, dist = INT64_MAX;
- int qindex = x->qindex;
-
- assert((bw & 0x07) == 0);
- assert((bh & 0x07) == 0);
-
- get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
- &visible_h);
-
- const int diff_stride = block_size_wide[bsize];
- const int16_t *diff = p->src_diff;
- sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w,
- visible_h, qindex);
- sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
- sse *= 16;
-
- if (!is_inter_block(mbmi)) {
- dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh,
- visible_w, visible_h, qindex);
- dist *= 16;
- } else {
- // For inter mode, the decoded pixels are provided in x->pred_luma,
- // while the predicted pixels are in dst.
- uint8_t *pred8;
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- pred8 = CONVERT_TO_BYTEPTR(pred16);
- else
- pred8 = (uint8_t *)pred16;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (j = 0; j < bh; j++)
- for (i = 0; i < bw; i++)
- CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
- } else {
- for (j = 0; j < bh; j++)
- for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
- }
-
- dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh,
- visible_w, visible_h, qindex);
- dist *= 16;
- }
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) {
- assert(args->rd_stats.sse == sse);
- assert(args->rd_stats.dist == dist);
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
}
-#endif // DEBUG_DIST_8X8
-
- args->rd_stats.sse = sse;
- args->rd_stats.dist = dist;
-
- rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
- rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
- rd = AOMMIN(rd1, rd2);
-
- args->rd_stats.rdcost = rd;
- args->this_rd = rd;
-
- if (args->this_rd > args->best_rd) args->exit_early = 1;
}
-#endif // CONFIG_DIST_8X8
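set_blk_skip() replaces the direct writes to x->blk_skip[]. A plausible sketch of its behavior (an assumption, not the library's verbatim code): one skip bit per plane packed into a per-block byte, so luma and chroma decisions no longer overwrite each other.

    #include <stdint.h>

    // Plausible sketch of set_blk_skip: pack one skip bit per plane
    // into a per-block byte so callers can record luma and chroma
    // skip decisions independently.
    static void set_blk_skip_sketch(uint8_t *blk_skip, int plane, int blk_idx,
                                    int skip) {
      if (skip)
        blk_skip[blk_idx] |= (uint8_t)(1 << plane);
      else
        blk_skip[blk_idx] &= (uint8_t)~(1 << plane);
    }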
static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
@@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
&args);
-#if CONFIG_DIST_8X8
- int bw = block_size_wide[bsize];
- int bh = block_size_high[bsize];
- if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 &&
- bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
- dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
-#endif
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
- if (args.exit_early) {
+ if (invalid_rd) {
av1_invalid_rd_stats(rd_stats);
} else {
*rd_stats = args.rd_stats;
@@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
+ }
+#endif
RD_STATS this_rd_stats;
if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
@@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
}
if (n == TX_4X4) break;
}
- mbmi->tx_size = best_tx_size;
- memcpy(mbmi->txk_type, best_txk_type,
- sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
- memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+ }
// Reset the pruning flags.
av1_zero(x->tx_search_prune);
@@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode,
// Model-based RD estimation for luma intra blocks.
static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE bsize, int mode_cost) {
+ BLOCK_SIZE bsize, int mode_cost, int mi_row,
+ int mi_col) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
// RD estimation.
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
- &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
- NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_INTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate,
+ &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
mode_cost +=
x->angle_delta_cost[mbmi->mode - V_PRED]
@@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
// Given the base colors as specified in centroids[], calculate the RD cost
// of palette mode.
-static void palette_rd_y(
- const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
- uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
- uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
- int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion,
- int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) {
+static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, const int *data,
+ int *centroids, int n, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd,
+ int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+ int *rate_overhead, int64_t *distortion,
+ int *skippable, PICK_MODE_CONTEXT *ctx,
+ uint8_t *blk_skip) {
optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
int k = av1_remove_duplicates(centroids, n);
if (k < PALETTE_MIN_SIZE) {
@@ -3551,7 +3837,8 @@ static void palette_rd_y(
extend_palette_color_map(color_map, cols, rows, block_width, block_height);
const int palette_mode_cost =
intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
- int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+ int64_t this_model_rd =
+ intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
return;
@@ -3580,11 +3867,11 @@ static void palette_rd_y(
}
static int rd_pick_palette_intra_sby(
- const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
- int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx,
- uint8_t *best_blk_skip) {
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+ PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
int rate_overhead = 0;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby(
// where the dominant colors and the k-means results are similar.
for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
- palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
- color_cache, n_cache, best_mbmi, best_palette_color_map,
- best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
- distortion, skippable, ctx, best_blk_skip);
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
}
// K-means clustering.
@@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby(
}
av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
}
- palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
- color_cache, n_cache, best_mbmi, best_palette_color_map,
- best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
- distortion, skippable, ctx, best_blk_skip);
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
}
}
@@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby(
// Return 1 if a filter intra mode is selected; return 0 otherwise.
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize, int mode_cost,
- int64_t *best_rd, int64_t *best_model_rd,
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int mode_cost, int64_t *best_rd,
+ int64_t *best_model_rd,
PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
@@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
int64_t this_rd, this_model_rd;
RD_STATS tokenonly_rd_stats;
mbmi->filter_intra_mode_info.filter_intra_mode = mode;
- this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
continue;
@@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
// Run RD calculation with a given luma intra prediction angle, and return
// the RD cost. Update the best mode info if the RD cost is the best so far.
static int64_t calc_rd_given_intra_angle(
- const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
- int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
- RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
- int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type,
- uint8_t *best_blk_skip) {
- int this_rate;
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
+ int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
+ TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
+ TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
RD_STATS tokenonly_rd_stats;
int64_t this_rd, this_model_rd;
MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
const int n4 = bsize_to_num_blk(bsize);
assert(!is_inter_block(mbmi));
-
mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
- this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
return INT64_MAX;
@@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle(
super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
- this_rate =
- tokenonly_rd_stats.rate + mode_cost +
- x->angle_delta_cost[mbmi->mode - V_PRED]
- [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]];
+ int this_rate =
+ mode_cost + tokenonly_rd_stats.rate +
+ x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
if (this_rd < *best_rd) {
@@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle(
// With a given luma directional intra prediction mode, pick the best angle
// delta. Return the RD cost corresponding to the best angle delta.
static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, RD_STATS *rd_stats,
- BLOCK_SIZE bsize, int mode_cost,
- int64_t best_rd,
+ int mi_row, int mi_col, int *rate,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int mode_cost, int64_t best_rd,
int64_t *best_model_rd) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
assert(!is_inter_block(mbmi));
- int i, angle_delta, best_angle_delta = 0;
- int first_try = 1;
- int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ int best_angle_delta = 0;
+ int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
TX_SIZE best_tx_size = mbmi->tx_size;
- const int n4 = bsize_to_num_blk(bsize);
TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
- for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+ for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
- for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- for (i = 0; i < 2; ++i) {
- best_rd_in = (best_rd == INT64_MAX)
- ? INT64_MAX
- : (best_rd + (best_rd >> (first_try ? 3 : 5)));
- this_rd = calc_rd_given_intra_angle(
- cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
- MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
- &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+ int first_try = 1;
+ for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
+ const int64_t best_rd_in =
+ (best_rd == INT64_MAX) ? INT64_MAX
+ : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+ const int64_t this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
+ (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
+ best_txk_type, best_blk_skip);
rd_cost[2 * angle_delta + i] = this_rd;
if (first_try && this_rd == INT64_MAX) return best_rd;
first_try = 0;
@@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
}
assert(best_rd != INT64_MAX);
- for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- int64_t rd_thresh;
- for (i = 0; i < 2; ++i) {
+ for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
int skip_search = 0;
- rd_thresh = best_rd + (best_rd >> 5);
+ const int64_t rd_thresh = best_rd + (best_rd >> 5);
if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
skip_search = 1;
if (!skip_search) {
- calc_rd_given_intra_angle(
- cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
- MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
- &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+ calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
+ best_rd, (1 - 2 * i) * angle_delta,
+ MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd,
+ best_model_rd, best_txk_type, best_blk_skip);
}
}
}
- mbmi->tx_size = best_tx_size;
- mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
- memcpy(mbmi->txk_type, best_txk_type,
- sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
- memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ }
return best_rd;
}
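The rewritten search keeps the original two-pass visit order: even deltas first (both signs), then odd deltas, with an odd delta skipped when both even neighbours already exceeded best_rd + (best_rd >> 5). A tiny program printing that order (MAX_ANGLE_DELTA is 3 in AV1):

    #include <stdio.h>

    #define MAX_ANGLE_DELTA 3  // AV1's value

    int main(void) {
      for (int d = 0; d <= MAX_ANGLE_DELTA; d += 2)
        for (int i = 0; i < 2; ++i) printf("%d ", (1 - 2 * i) * d);
      for (int d = 1; d <= MAX_ANGLE_DELTA; d += 2)
        for (int i = 0; i < 2; ++i) printf("%d ", (1 - 2 * i) * d);
      printf("\n");  // 0 0 2 -2 1 -1 3 -3
      return 0;
    }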
@@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
// This function is used only for intra_only frames
static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize, int64_t best_rd,
- PICK_MODE_CONTEXT *ctx) {
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
assert(!is_inter_block(mbmi));
@@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO best_mbmi = *mbmi;
/* Y Search for intra prediction mode */
- for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) {
+ for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
RD_STATS this_rd_stats;
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd, this_model_rd;
mbmi->mode = intra_rd_search_mode_order[mode_idx];
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
- this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+ this_model_rd =
+ intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
if (best_model_rd != INT64_MAX &&
this_model_rd > best_model_rd + (best_model_rd >> 1))
continue;
@@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
if (is_directional_mode && av1_use_angle_delta(bsize)) {
this_rd_stats.rate = INT_MAX;
- rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
- bmode_costs[mbmi->mode], best_rd, &best_model_rd);
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
+ &this_rd_stats, bsize, bmode_costs[mbmi->mode],
+ best_rd, &best_model_rd);
} else {
super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
}
@@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
if (try_palette) {
- rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi,
- best_palette_color_map, &best_rd, &best_model_rd,
- rate, rate_tokenonly, distortion, skippable, ctx,
- ctx->blk_skip);
+ rd_pick_palette_intra_sby(
+ cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
+ best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
+ distortion, skippable, ctx, ctx->blk_skip);
}
if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
- if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
- skippable, bsize, bmode_costs[DC_PRED],
- &best_rd, &best_model_rd, ctx)) {
+ if (rd_pick_filter_intra_sby(
+ cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
+ bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
best_mbmi = *mbmi;
}
}
@@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
int blk_row, int blk_col, int plane, int block,
- int plane_bsize, const ENTROPY_CONTEXT *a,
- const ENTROPY_CONTEXT *l, RD_STATS *rd_stats,
+ int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
TXB_RD_INFO *rd_info_array) {
const struct macroblock_plane *const p = &x->plane[plane];
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
const uint16_t cur_joint_ctx =
- (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
-
+ (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
const int txk_type_idx =
av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
  // Look up RD and terminate early when we've already processed exactly
@@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
RD_STATS this_rd_stats;
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+ txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
av1_merge_rd_stats(rd_stats, &this_rd_stats);
@@ -4428,8 +4716,8 @@ static void try_tx_block_no_split(
rd_stats->zero_rate = zero_blk_rate;
const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
mbmi->inter_tx_size[index] = tx_size;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
- ptl, rd_stats, ftxs_mode, ref_best_rd,
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
assert(rd_stats->rate < INT_MAX);
@@ -4444,12 +4732,12 @@ static void try_tx_block_no_split(
rd_stats->rate = zero_blk_rate;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
- x->blk_skip[blk_row * bw + blk_col] = 1;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
p->eobs[block] = 0;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
DCT_DCT);
} else {
- x->blk_skip[blk_row * bw + blk_col] = 0;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
rd_stats->skip = 0;
}
@@ -4482,7 +4770,6 @@ static void try_tx_block_split(
MACROBLOCKD *const xd = &x->e_mbd;
const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
- struct macroblock_plane *const p = &x->plane[0];
const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
const int bsw = tx_size_wide_unit[sub_txs];
const int bsh = tx_size_high_unit[sub_txs];
@@ -4490,10 +4777,7 @@ static void try_tx_block_split(
RD_STATS this_rd_stats;
int this_cost_valid = 1;
int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
- int sub8x8_eob[4] = { 0, 0, 0, 0 };
- struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
+
split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
assert(tx_size < TX_SIZES_ALL);
@@ -4511,123 +4795,22 @@ static void try_tx_block_split(
&this_cost_valid, ftxs_mode,
(rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && tx_size == TX_8X8) {
- sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
- }
-#endif // CONFIG_DIST_8X8
+ if (!this_cost_valid) goto LOOP_EXIT;
+
av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (no_split_rd < tmp_rd) {
- this_cost_valid = 0;
- goto LOOP_EXIT;
- }
+
+ if (no_split_rd < tmp_rd) {
+ this_cost_valid = 0;
+ goto LOOP_EXIT;
+ }
block += sub_step;
}
}
LOOP_EXIT : {}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
-
- const uint8_t *src =
- &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
- const uint8_t *dst =
- &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
- int64_t dist_8x8;
- const int qindex = x->qindex;
- const int pred_stride = block_size_wide[plane_bsize];
- const int pred_idx = (blk_row * pred_stride + blk_col)
- << tx_size_wide_log2[0];
- const int16_t *pred = &x->pred_luma[pred_idx];
- int i, j;
- int row, col;
-
- uint8_t *pred8;
- DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
- 8, 8, 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.sse == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats->sse = dist_8x8;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- pred8 = CONVERT_TO_BYTEPTR(pred8_16);
- else
- pred8 = (uint8_t *)pred8_16;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] =
- pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
- dst)[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- } else {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- dst[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- }
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
- 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats->dist = dist_8x8;
- tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
- }
-#endif // CONFIG_DIST_8X8
if (this_cost_valid) *split_rd = tmp_rd;
}
@@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
const int try_no_split = 1;
int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
-
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8)
+ try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
// TX no split
@@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
}
}
-#if COLLECT_TX_SIZE_DATA
- // Do not skip tx_split when collecting tx size data.
- try_split = 1;
-#endif
-
// TX split
int64_t split_rd = INT64_MAX;
RD_STATS split_rd_stats;
@@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_info_node, &split_rd_stats, &split_rd);
}
-#if COLLECT_TX_SIZE_DATA
- do {
- if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break;
-
-#if 0
- // Randomly select blocks to collect data to reduce output file size.
- const int rnd_val = rand() % 2;
- if (rnd_val) break;
-#endif
-
- const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
- const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
- const int within_border =
- mi_row >= xd->tile.mi_row_start &&
- (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
- mi_col >= xd->tile.mi_col_start &&
- (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
- if (!within_border) break;
-
- FILE *fp = fopen(av1_tx_size_data_output_file, "a");
- if (!fp) break;
-
- // Split decision, RD cost, block type(inter/intra), q-index, rdmult,
- // and block size.
- const int split_selected = sum_rd < this_rd;
- const int is_inter = 1;
- const int txb_w = tx_size_wide[tx_size];
- const int txb_h = tx_size_high[tx_size];
- fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected,
- (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex,
- x->rdmult, is_inter, txb_w, txb_h);
-
- // Residue signal.
- const int diff_stride = block_size_wide[plane_bsize];
- const int16_t *src_diff =
- &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
- for (int r = 0; r < txb_h; ++r) {
- for (int c = 0; c < txb_w; ++c) {
- fprintf(fp, "%d,", src_diff[c]);
- }
- src_diff += diff_stride;
- }
- fprintf(fp, "\n");
-
- fclose(fp);
- } while (0);
-#endif // COLLECT_TX_SIZE_DATA
-
if (no_split.rd < split_rd) {
ENTROPY_CONTEXT *pta = ta + blk_col;
ENTROPY_CONTEXT *ptl = tl + blk_row;
@@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
mbmi->tx_size = tx_size_selected;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
no_split.tx_type);
- x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
} else {
*rd_stats = split_rd_stats;
if (split_rd == INT64_MAX) *is_cost_valid = 0;
@@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
TXB_RD_INFO_NODE *rd_info_tree) {
MACROBLOCKD *const xd = &x->e_mbd;
int is_cost_valid = 1;
- int64_t this_rd = 0;
+ int64_t this_rd = 0, skip_rd = 0;
if (ref_best_rd < 0) is_cost_valid = 0;
@@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ skip_rd = RDCOST(x->rdmult, s1, 0);
+ this_rd = RDCOST(x->rdmult, s0, 0);
for (idy = 0; idy < mi_height; idy += bh) {
for (idx = 0; idx < mi_width; idx += bw) {
+ int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
plane_bsize, ctxa, ctxl, tx_above, tx_left,
- &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid,
- ftxs_mode, rd_info_tree);
+ &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
+ rd_info_tree);
if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
av1_invalid_rd_stats(rd_stats);
return;
}
av1_merge_rd_stats(rd_stats, &pn_rd_stats);
- this_rd +=
- AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
- RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
block += step;
if (rd_info_tree != NULL) rd_info_tree += 1;
}
}
+ if (skip_rd <= this_rd) {
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
+ }
}
- const int skip_ctx = av1_get_skip_context(xd);
- const int s0 = x->skip_cost[skip_ctx][0];
- const int s1 = x->skip_cost[skip_ctx][1];
- int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
- this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
- if (skip_rd <= this_rd) {
- this_rd = skip_rd;
- rd_stats->rate = 0;
- rd_stats->dist = rd_stats->sse;
- rd_stats->skip = 1;
- } else {
- rd_stats->skip = 0;
- }
- if (this_rd > ref_best_rd) is_cost_valid = 0;
-
if (!is_cost_valid) {
// reset cost value
av1_invalid_rd_stats(rd_stats);
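
The rewritten loop above keeps two running costs per block: coding the
residual (rate plus the skip=0 signaling cost s0, against dist) and dropping
it entirely (the skip=1 cost s1, against the raw sse). A hedged sketch of
that final decision in isolation, reusing sketch_rdcost from the earlier
sketch; the names are illustrative.

    #include <stdint.h>

    int64_t sketch_rdcost(int rdmult, int rate, int64_t dist);  // as above

    // Decide between coding residual coefficients and signaling skip.
    static void sketch_skip_decision(int rdmult, int rate, int64_t dist,
                                     int64_t sse, int s0, int s1,
                                     int *out_rate, int64_t *out_dist,
                                     int *out_skip) {
      const int64_t this_rd = sketch_rdcost(rdmult, rate + s0, dist);
      const int64_t skip_rd = sketch_rdcost(rdmult, s1, sse);
      if (skip_rd <= this_rd) {
        *out_rate = 0;    // no coefficient bits are sent
        *out_dist = sse;  // pay the full reconstruction error instead
        *out_skip = 1;
      } else {
        *out_rate = rate + s0;
        *out_dist = dist;
        *out_skip = 0;
      }
    }
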
@@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
.txb_skip_cost[txb_ctx.txb_skip_ctx][1];
rd_stats->zero_rate = zero_blk_rate;
rd_stats->ref_rdcost = ref_best_rd;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta,
- tl, rd_stats, ftxs_mode, ref_best_rd, NULL);
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_stats->rate = zero_blk_rate;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
- x->blk_skip[blk_row * mi_width + blk_col] = 1;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
x->plane[0].eobs[block] = 0;
x->plane[0].txb_entropy_ctx[block] = 0;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
DCT_DCT);
} else {
rd_stats->skip = 0;
- x->blk_skip[blk_row * mi_width + blk_col] = 0;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
}
if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
rd_stats->rate += x->txfm_partition_cost[ctx][0];
@@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
const uint32_t hash) {
// Linear search through the circular buffer to find matching hash.
- int index;
- for (int i = cur_record->num - 1; i >= 0; i--) {
- index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
- if (cur_record->hash_vals[index] == hash) return index;
+ for (int i = cur_record->index_start - 1; i >= 0; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
}
-
+ for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
+ }
+ int index;
// If not found - add new RD info into the buffer and return its index
if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
index = (cur_record->index_start + cur_record->num) %
@@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
return index;
}
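
The lookup above was rewritten from a single newest-to-oldest scan that
computed (index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN on every step into
two straight-line passes over the circular buffer, removing the modulo from
the inner loop. The same two-pass scan over a generic ring buffer, with
hypothetical names:

    #include <stdint.h>

    #define RING_LEN 64

    typedef struct {
      uint32_t hash_vals[RING_LEN];
      int index_start;  // slot of the oldest entry
      int num;          // number of slots in use
    } Ring;

    // Returns the slot holding 'hash', or -1. When the buffer has wrapped,
    // slots [0, index_start) were written most recently, so scan them
    // first; then scan the remaining tail. No '%' in either loop.
    static int ring_find(const Ring *r, uint32_t hash) {
      for (int i = r->index_start - 1; i >= 0; i--)
        if (r->hash_vals[i] == hash) return i;
      for (int i = r->num - 1; i >= r->index_start; i--)
        if (r->hash_vals[i] == hash) return i;
      return -1;
    }
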
+typedef struct {
+ int leaf;
+ int8_t children[4];
+} RD_RECORD_IDX_NODE;
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0, 0, 0, 0 } },
+ { 1, { 0, 0, 0, 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0 } },
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+ { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 5, 6 } },
+ { 0, { 7, 8, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 7, 8 } },
+ { 0, { 5, 6, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+ { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } },
+ { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+ { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } },
+ { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+ { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+ { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+ { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+ { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } },
+ { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+ { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+ { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+ { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+ { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } },
+ { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+ { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+ { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+ { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+ { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+ { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+ { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+ { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+ { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+ { 0, { 1, -1, 2, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+ NULL, // BLOCK_4X4
+ NULL, // BLOCK_4X8
+ NULL, // BLOCK_8X4
+ rd_record_tree_8x8, // BLOCK_8X8
+ rd_record_tree_8x16, // BLOCK_8X16
+ rd_record_tree_16x8, // BLOCK_16X8
+ rd_record_tree_16x16, // BLOCK_16X16
+ rd_record_tree_1_2, // BLOCK_16X32
+ rd_record_tree_2_1, // BLOCK_32X16
+ rd_record_tree_sqr, // BLOCK_32X32
+ rd_record_tree_1_2, // BLOCK_32X64
+ rd_record_tree_2_1, // BLOCK_64X32
+ rd_record_tree_sqr, // BLOCK_64X64
+ rd_record_tree_64x128, // BLOCK_64X128
+ rd_record_tree_128x64, // BLOCK_128X64
+ rd_record_tree_128x128, // BLOCK_128X128
+ NULL, // BLOCK_4X16
+ NULL, // BLOCK_16X4
+ rd_record_tree_1_4, // BLOCK_8X32
+ rd_record_tree_4_1, // BLOCK_32X8
+ rd_record_tree_1_4, // BLOCK_16X64
+ rd_record_tree_4_1, // BLOCK_64X16
+};
+
+static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 0, // BLOCK_4X8
+ 0, // BLOCK_8X4
+ sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8
+ sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16
+ sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8
+ sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64
+ sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128
+ sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64
+ sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128
+ 0, // BLOCK_4X16
+ 0, // BLOCK_16X4
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16
+};
+
+static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+ BLOCK_SIZE bsize) {
+ const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
+ const int size = rd_record_tree_size[bsize];
+ for (int i = 0; i < size; ++i) {
+ if (rd_record[i].leaf) {
+ av1_zero(tree[i].children);
+ } else {
+ for (int j = 0; j < 4; ++j) {
+ const int8_t idx = rd_record[i].children[j];
+ tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
+ }
+ }
+ }
+}
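
Each rd_record_tree_* table above flattens a TX-partition quadtree into an
array: entry 0 is the root, leaf marks terminal nodes, and children holds
array indices, where -1 (and 0, which is never a valid child) means absent;
that is why init_rd_record_tree only takes a pointer when idx > 0. A small
sketch of walking such a table, with simplified types:

    #include <stdint.h>

    typedef struct {
      int leaf;
      int8_t children[4];
    } IdxNode;

    // Count leaves reachable from 'node' in a flattened quadtree table.
    static int count_leaves(const IdxNode *table, int node) {
      if (table[node].leaf) return 1;
      int n = 0;
      for (int j = 0; j < 4; ++j) {
        const int8_t c = table[node].children[j];
        if (c > 0) n += count_leaves(table, c);
      }
      return n;
    }

Applied to the rd_record_tree_16x16 layout this would return 4 from the
root, one per 8x8 quadrant.
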
+
// Go through all TX blocks that could be used in TX size search, compute
// residual hash values for them and find matching RD info that stores previous
// RD search results for these TX blocks. The idea is to prevent repeated
@@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
// Hashing is performed only for square TX sizes larger than TX_4X4
if (max_square_tx_size < TX_8X8) return 0;
-
- const int bw_mi = mi_size_wide[bsize];
const int diff_stride = bw;
const struct macroblock_plane *const p = &x->plane[0];
const int16_t *diff = &p->src_diff[0];
-
+ init_rd_record_tree(dst_rd_info, bsize);
// Coordinates of the top-left corner of current block within the superblock
// measured in pixels:
const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
int cur_rd_info_idx = 0;
int cur_tx_depth = 0;
- uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
- uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
while (cur_tx_depth <= MAX_VARTX_DEPTH) {
const int cur_tx_bw = tx_size_wide[cur_tx_size];
const int cur_tx_bh = tx_size_high[cur_tx_size];
if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+ const int tx_size_idx = cur_tx_size - TX_8X8;
for (int row = 0; row < bh; row += cur_tx_bh) {
for (int col = 0; col < bw; col += cur_tx_bw) {
if (cur_tx_bw != cur_tx_bh) {
@@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
(uint8_t *)hash_data,
2 * cur_tx_bw * cur_tx_bh);
-
// Find corresponding RD info based on the hash value.
- const int rd_record_idx =
- row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
- col_in_sb;
-
- int idx = find_tx_size_rd_info(
- &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
+ const int record_idx =
+ row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+ TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+ int idx = find_tx_size_rd_info(records, hash);
dst_rd_info[cur_rd_info_idx].rd_info_array =
- &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
- .tx_rd_info[idx];
- }
-
- // Update the output quadtree RD info structure.
- av1_zero(dst_rd_info[cur_rd_info_idx].children);
- const int this_mi_row = row / MI_SIZE;
- const int this_mi_col = col / MI_SIZE;
- if (cur_tx_depth > 0) { // Set up child pointers.
- const int mi_index = this_mi_row * bw_mi + this_mi_col;
- const int child_idx = child_idx_buf[mi_index];
- assert(child_idx < 4);
- dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] =
- &dst_rd_info[cur_rd_info_idx];
- }
- if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx.
- const int tx_bh_mi = cur_tx_bh / MI_SIZE;
- const int tx_bw_mi = cur_tx_bw / MI_SIZE;
- for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) {
- memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx,
- tx_bw_mi);
- }
- int child_idx = 0;
- const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size];
- const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size];
- for (int i = this_mi_row; i < this_mi_row + tx_bh_mi;
- i += next_tx_bh_mi) {
- for (int j = this_mi_col; j < this_mi_col + tx_bw_mi;
- j += next_tx_bw_mi) {
- assert(child_idx < 4);
- child_idx_buf[i * bw_mi + j] = child_idx++;
- }
- }
+ &records->tx_rd_info[idx];
}
++cur_rd_info_idx;
}
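
With the per-MI parent/child index buffers gone (the static tables above now
supply the tree shape), each TX block in this loop only needs its CRC32C
residual hash to probe the per-size record table. A hedged sketch of the
probe-or-allocate pattern behind find_tx_size_rd_info; the names here are
illustrative, not the libaom API:

    #include <stdint.h>

    typedef struct {
      uint32_t hash;
      int rate;
      int64_t dist;
      int valid;
    } TxRdInfo;

    // Return the cached entry for 'hash', or the slot where a fresh TX
    // search result should be written (returned with valid == 0).
    static TxRdInfo *probe_rd_cache(TxRdInfo *records, int n,
                                    int *next_slot, uint32_t hash) {
      for (int i = 0; i < n; ++i)
        if (records[i].valid && records[i].hash == hash) return &records[i];
      TxRdInfo *slot = &records[*next_slot];
      *next_slot = (*next_slot + 1) % n;  // overwrite oldest on overflow
      slot->hash = hash;
      slot->valid = 0;  // caller fills rate/dist, then sets valid = 1
      return slot;
    }
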
@@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
const MACROBLOCKD *xd = &x->e_mbd;
const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
- *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
const int64_t mse = *dist / bw / bh;
// Normalized quantizer takes the transform upscaling factor (8 for tx size
// smaller than 32) into account.
@@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
mbmi->tx_size = tx_size;
- memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
+ for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
rd_stats->skip = 1;
rd_stats->rate = 0;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
@@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
int model_rate;
int64_t model_dist;
int model_skip;
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist,
- &model_skip, NULL, NULL, NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
+ &model_skip, NULL, NULL, NULL, NULL);
const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
// If the modeled rd is a lot worse than the best so far, breakout.
// TODO(debargha, urvang): Improve the model and make the check below
// tighter.
assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
cpi->sf.model_based_prune_tx_search_level <= 2);
+ static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
+ 4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
if (!model_skip &&
- model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) >
- ref_best_rd)
+ ((model_rd *
+ prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
+ 3) > ref_best_rd)
return;
}
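
The prune test above replaces a division by (5 - level) with a multiply and
shift: a candidate is skipped when model_rd * factor / 8 exceeds
ref_best_rd, with factor taken from prune_factor_by8[level - 1]. Assuming,
purely for the arithmetic, that MODELRD_TYPE_TX_SEARCH_PRUNE evaluates to 1
(its real value is an enum defined elsewhere), the factors are 3 and 5: level
1 prunes once model_rd exceeds about 8/3 of ref_best_rd, level 2 once it
exceeds about 8/5, so the higher speed level prunes more aggressively.

    #include <stdint.h>

    // Illustrative prune predicate; 'factor' plays the role of
    // prune_factor_by8[level - 1] (assumed 3 or 5 here).
    static int sketch_prune_tx_search(int64_t model_rd, int64_t ref_best_rd,
                                      int factor) {
      return ((model_rd * factor) >> 3) > ref_best_rd;
    }
    // With ref_best_rd = 1000: factor 3 prunes around model_rd > 2667,
    // factor 5 around model_rd > 1600 (the >> 3 floors, so the exact
    // boundary shifts by a couple of units).
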
@@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
// Precompute residual hashes and find existing or add new RD records to
// store and reuse rate and distortion values to speed up TX size search.
- TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
+ TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
int found_rd_info = 0;
if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
found_rd_info =
@@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
assert(plane > 0);
assert(tx_size < TX_SIZES_ALL);
MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
ENTROPY_CONTEXT *ta = above_ctx + blk_col;
ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
- ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL);
+ &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
+
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int blk_idx = blk_row * mi_width + blk_col;
+
av1_set_txb_context(x, plane, block, tx_size, ta, tl);
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ }
+
+ // Set chroma blk_skip to 0
+ set_blk_skip(x, plane, blk_idx, 0);
}
// Return value 0: early termination triggered, no valid rd cost available;
// 1: rd cost values are valid.
static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
- int64_t ref_best_rd,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
FAST_TX_SEARCH_MODE ftxs_mode) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
int plane;
int is_cost_valid = 1;
int64_t this_rd = 0;
+ int64_t skip_rd = 0;
- if (ref_best_rd < 0) is_cost_valid = 0;
+ if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
av1_init_rd_stats(rd_stats);
- if (x->skip_chroma_rd) return is_cost_valid;
+ if (x->skip_chroma_rd) {
+ if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+ return is_cost_valid;
+ }
+
const BLOCK_SIZE bsizec = scale_chroma_bsize(
bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
@@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
const int step = bh * bw;
ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
- RD_STATS pn_rd_stats;
- av1_init_rd_stats(&pn_rd_stats);
av1_get_entropy_contexts(bsizec, pd, ta, tl);
for (idy = 0; idy < mi_height; idy += bh) {
for (idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if ((this_rd > non_skip_ref_best_rd) &&
+ (skip_rd > skip_ref_best_rd)) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
block += step;
}
}
-
- if (pn_rd_stats.rate == INT_MAX) {
- is_cost_valid = 0;
- break;
- }
-
- av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-
- this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
- RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse));
-
- if (this_rd > ref_best_rd) {
- is_cost_valid = 0;
- break;
- }
}
- }
-
- if (!is_cost_valid) {
+ } else {
// reset cost value
av1_invalid_rd_stats(rd_stats);
}
@@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
(mv->col >> 3) > mv_limits->col_max;
}
-static INLINE int get_single_mode(int this_mode, int ref_idx,
- int is_comp_pred) {
- int single_mode;
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+ int ref_idx, int is_comp_pred) {
+ PREDICTION_MODE single_mode;
if (is_comp_pred) {
single_mode =
ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
@@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx,
return single_mode;
}
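
get_single_mode maps a compound mode to the single-reference mode used on
each side; for example NEAREST_NEWMV decomposes to NEARESTMV for ref_idx 0
and NEWMV for ref_idx 1. A toy table illustrating that decomposition; the
enums below are illustrative, while libaom implements the real mapping in
compound_ref0_mode()/compound_ref1_mode() over its full mode enum:

    typedef enum { S_NEARESTMV, S_NEARMV, S_GLOBALMV, S_NEWMV } SingleMode;
    typedef enum {
      C_NEAREST_NEARESTMV,
      C_NEAREST_NEWMV,
      C_NEW_NEARESTMV,
      C_GLOBAL_GLOBALMV
    } CompoundMode;

    static const SingleMode kRef0[] = { S_NEARESTMV, S_NEARESTMV, S_NEWMV,
                                        S_GLOBALMV };
    static const SingleMode kRef1[] = { S_NEARESTMV, S_NEWMV, S_NEARESTMV,
                                        S_GLOBALMV };

    static SingleMode toy_single_mode(CompoundMode mode, int ref_idx) {
      return ref_idx ? kRef1[mode] : kRef0[mode];
    }
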
-/* If the current mode shares the same mv with other modes with higher prority,
- * skip this mode. This priority order is nearest > global > near. */
-static int skip_repeated_mv(const AV1_COMMON *const cm,
- const MACROBLOCK *const x, int this_mode,
- const MV_REFERENCE_FRAME ref_frames[2]) {
- const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
- const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- if (!is_comp_pred) {
- if (this_mode == NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
- // NEARMV has the same motion vector as NEARESTMV
- return 1;
- }
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
- // NEARMV has the same motion vector as GLOBALMV
- return 1;
- }
- }
- if (this_mode == GLOBALMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
- // GLOBALMV has the same motion vector as NEARESTMV
- return 1;
- }
- }
- } else {
- for (int i = 0; i < 2; ++i) {
- const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
- if (single_mode == NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
- // NEARMV has the same motion vector as NEARESTMV in compound mode
- return 1;
- }
- }
- }
- if (this_mode == NEAR_NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
- cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
- // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV
- return 1;
- }
- }
- if (this_mode == GLOBAL_GLOBALMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
- cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
- // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV
- return 1;
- }
- }
- }
- return 0;
-}
-
static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
int mi_col, int_mv *ref_mv_sub8x8[2],
@@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
const int num_planes = av1_num_planes(cm);
const int pw = block_size_wide[bsize];
const int ph = block_size_high[bsize];
+ const int plane = 0;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
// This function should only ever be called for compound modes
assert(has_second_ref(mbmi));
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
int_mv ref_mv[2];
int ite, ref;
@@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
struct macroblockd_plane *const pd = &xd->plane[0];
const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
- int is_global[2];
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ WarpTypesAllowed warp_types[2];
for (ref = 0; ref < 2; ++ref) {
const WarpedMotionParams *const wm =
&xd->global_motion[xd->mi[0]->ref_frame[ref]];
- is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype);
+ const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+ warp_types[ref].global_warp_allowed = is_global;
+ warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
}
// Do joint motion search in compound mode to get more accurate mv.
@@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
};
// Prediction buffer from second frame.
- DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
- uint8_t *second_pred;
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
(void)ref_mv_sub8x8;
+  const int have_nearmv = have_nearmv_in_inter_mode(mbmi->mode);
+  const int ref_mv_idx = mbmi->ref_mv_idx + (have_nearmv ? 1 : 0);
+ MV *const best_mv = &x->best_mv.as_mv;
+ const int search_range = SEARCH_RANGE_8P;
+ const int sadpb = x->sadperbit16;
// Allow joint search multiple times iteratively for each reference frame
// and break out of the search loop if it couldn't find a better mv.
for (ite = 0; ite < 4; ite++) {
struct buf_2d ref_yv12[2];
int bestsme = INT_MAX;
- int sadpb = x->sadperbit16;
- MV *const best_mv = &x->best_mv.as_mv;
- int search_range = 3;
-
MvLimits tmp_mv_limits = x->mv_limits;
int id = ite % 2; // Even iterations search in the first reference frame,
// odd iterations search in the second. The predictor
// found for the 'other' reference frame is factored in.
- const int plane = 0;
- ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[!id];
- warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+        cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
for (ref = 0; ref < 2; ++ref) {
ref_mv[ref] = av1_get_ref_mv(x, ref);
// Swap out the reference frame for a version that's been scaled to
@@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
ref_yv12[1] = xd->plane[plane].pre[1];
// Get the prediction block from the 'other' reference frame.
- InterpFilters interp_filters = EIGHTTAP_REGULAR;
+ const InterpFilters interp_filters = EIGHTTAP_REGULAR;
// Since we have scaled the reference frames to match the size of the
// current frame we must use a unit scaling factor during mode selection.
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
- av1_highbd_build_inter_predictor(
- ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
- &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters,
- &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE,
- mi_row * MI_SIZE, xd, cm->allow_warped_motion);
- } else {
- second_pred = (uint8_t *)second_pred_alloc_16;
- av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
- second_pred, pw, &cur_mv[!id].as_mv,
- &cm->sf_identity, pw, ph, &conv_params,
- interp_filters, &warp_types, p_col, p_row,
- plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
- mi_row * MI_SIZE, xd, cm->allow_warped_motion);
- }
+ av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &cur_mv[!id].as_mv,
+ &cm->sf_identity, pw, ph, &conv_params,
+ interp_filters, &warp_types[!id], p_col, p_row,
+ plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
const int order_idx = id != 0;
av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
@@ -6325,15 +6541,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
// Use the mv result from the single mode as mv predictor.
- // Use the mv result from the single mode as mv predictor.
*best_mv = cur_mv[id].as_mv;
best_mv->col >>= 3;
best_mv->row >>= 3;
- av1_set_mvcost(
- x, id,
- mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+ av1_set_mvcost(x, id, ref_mv_idx);
// Small-range full-pixel motion search.
bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Restore the pointer to the first prediction buffer.
if (id) xd->plane[plane].pre[0] = ref_yv12[0];
-
if (bestsme < last_besterr[id]) {
cur_mv[id].as_mv = *best_mv;
last_besterr[id] = bestsme;
@@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
*rate_mv = 0;
for (ref = 0; ref < 2; ++ref) {
- av1_set_mvcost(
- x, ref,
- mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
-
+ av1_set_mvcost(x, ref, ref_mv_idx);
const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
*rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
switch (mbmi->motion_mode) {
case SIMPLE_TRANSLATION:
- bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
- sadpb, cond_cost_list(cpi, cost_list),
- &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
- (MI_SIZE * mi_row), 0);
+ bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
break;
case OBMC_CAUSAL:
- bestsme = av1_obmc_full_pixel_diamond(
- cpi, x, &mvp_full, step_param, sadpb,
- MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
- &(x->best_mv.as_mv), 0);
+ bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param,
+ 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &(x->best_mv.as_mv), 0);
break;
default: assert(0 && "Invalid motion mode!\n");
}
@@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
cm->width, cm->height);
- ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global;
warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
// Get the prediction block from the 'other' reference frame.
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- av1_highbd_build_inter_predictor(
- ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
- 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
- MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
- cm->allow_warped_motion);
- } else {
- av1_build_inter_predictor(
- ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
- &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
- !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
- cm->allow_warped_motion);
- }
+ av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw,
+ other_mv, &sf, pw, ph, &conv_params,
+ mbmi->interp_filters, &warp_types, p_col, p_row,
+ plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
&xd->jcp_param.bck_offset,
@@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
int bestsme = INT_MAX;
int sadpb = x->sadperbit16;
MV *const best_mv = &x->best_mv.as_mv;
- int search_range = 3;
+ int search_range = SEARCH_RANGE_8P;
MvLimits tmp_mv_limits = x->mv_limits;
@@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed(
// near mv modes to reduce distortion in subsequent blocks and also improve
// visual quality.
#define NEW_MV_DISCOUNT_FACTOR 8
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
- int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
const MV_REFERENCE_FRAME *ref_frame,
const MB_MODE_INFO_EXT *mbmi_ext);
static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
- int this_mode, int_mv this_mv) {
+ PREDICTION_MODE this_mode, int_mv this_mv) {
if (this_mode == NEWMV && this_mv.as_int != 0 &&
!cpi->rc.is_src_frame_alt_ref) {
// Only discount new_mv when nearst_mv and all near_mv are zero, and the
@@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
+ assert(N >= 64);
int rate;
int64_t dist;
int64_t rd, best_rd = INT64_MAX;
@@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
(int64_t)aom_sum_squares_i16(residual1, N)) *
(1 << WEDGE_WEIGHT_BITS) / 2;
int16_t *ds = residual0;
- if (N < 64)
- av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N);
- else
- av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
- // TODO(jingning): Make sse2 functions support N = 16 case
- if (N < 64)
- wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
- else
- wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
- if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
- else
- sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+
rate += x->wedge_idx_cost[bsize][wedge_index];
rd = RDCOST(x->rdmult, rate, dist);
@@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
+ assert(N >= 64);
int rate;
int64_t dist;
int64_t rd, best_rd = INT64_MAX;
@@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
- if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
- else
- sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
rate += x->wedge_idx_cost[bsize][wedge_index];
rd = RDCOST(x->rdmult, rate, dist);
@@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
MB_MODE_INFO *const mbmi = xd->mi[0];
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
- const int N = bw * bh;
+ const int N = 1 << num_pels_log2_lookup[bsize];
int rate;
- uint64_t sse;
int64_t dist;
- int64_t rd0;
DIFFWTD_MASK_TYPE cur_mask_type;
int64_t best_rd = INT64_MAX;
DIFFWTD_MASK_TYPE best_mask_type = 0;
const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
// try each mask type and its inverse
for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
// build mask and inverse
if (hbd)
av1_build_compound_diffwtd_mask_highbd(
- xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
else
- av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1,
- bw, bh, bw);
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+ p0, bw, p1, bw, bh, bw);
// compute rd for mask
- sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N);
+ uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+ tmp_mask[cur_mask_type], N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
- rd0 = RDCOST(x->rdmult, rate, dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
if (rd0 < best_rd) {
best_mask_type = cur_mask_type;
best_rd = rd0;
}
}
-
- // make final mask
mbmi->interinter_comp.mask_type = best_mask_type;
- if (hbd)
- av1_build_compound_diffwtd_mask_highbd(
- xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0),
- bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
- else
- av1_build_compound_diffwtd_mask(
- xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw);
-
+ if (best_mask_type == DIFFWTD_38_INV) {
+ memcpy(xd->seg_mask, seg_mask, N * 2);
+ }
return best_rd;
}
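
The rewrite above builds each DIFFWTD mask candidate into its own buffer
(xd->seg_mask for the first type, a local scratch buffer for its inverse),
so the winning mask never has to be rebuilt; a memcpy is needed only when
the inverse wins. The same evaluate-into-two-buffers pattern as a generic,
hedged sketch with hypothetical callbacks:

    #include <stdint.h>
    #include <string.h>

    // Evaluate two candidates without rebuilding the winner. build() fills
    // a buffer for the given type; cost() scores it. Buffer 0 is the
    // caller's destination; buffer 1 is scratch, copied back only if it
    // wins.
    static int pick_best_of_two(uint8_t *dst, uint8_t *scratch, size_t n,
                                void (*build)(uint8_t *buf, int type),
                                int64_t (*cost)(const uint8_t *buf)) {
      uint8_t *bufs[2] = { dst, scratch };
      int best = 0;
      int64_t best_cost = INT64_MAX;
      for (int type = 0; type < 2; ++type) {
        build(bufs[type], type);
        const int64_t c = cost(bufs[type]);
        if (c < best_cost) {
          best_cost = c;
          best = type;
        }
      }
      if (best == 1) memcpy(dst, scratch, n);  // copy only when scratch wins
      return best;
    }
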
@@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
}
}
-static int interinter_compound_motion_search(
- const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
- const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode,
+ int mi_row, int mi_col) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
int_mv tmp_mv[2];
@@ -7440,11 +7638,40 @@ static int interinter_compound_motion_search(
return tmp_rate_mv;
}
+static void get_inter_predictors_masked_compound(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
+ int16_t *residual1, int16_t *diff10, int *strides) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int can_use_previous = cm->allow_warped_motion;
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+ const struct buf_2d *const src = &x->plane[0].src;
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+ bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+ }
+}
+
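
get_inter_predictors_masked_compound prepares the two arrays every mask
trial consumes: residual1 = src - pred1 and diff10 = pred1 - pred0, which
lets per-mask SSE be computed from residuals alone rather than by
re-blending the predictors for each candidate. A scalar sketch of those
subtractions on the 8-bit path; the names are hypothetical:

    #include <stdint.h>

    // residual1[i] = src[i] - pred1[i]; diff10[i] = pred1[i] - pred0[i].
    // Strides are in elements; both outputs are densely packed bw x bh.
    static void sketch_masked_compound_residuals(
        int bh, int bw, int16_t *residual1, int16_t *diff10,
        const uint8_t *src, int src_stride, const uint8_t *pred0,
        const uint8_t *pred1, int pred_stride) {
      for (int r = 0; r < bh; ++r) {
        for (int c = 0; c < bw; ++c) {
          residual1[r * bw + c] =
              (int16_t)(src[r * src_stride + c] - pred1[r * pred_stride + c]);
          diff10[r * bw + c] =
              (int16_t)(pred1[r * pred_stride + c] -
                        pred0[r * pred_stride + c]);
        }
      }
    }
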
static int64_t build_and_cost_compound_type(
const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
- const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
- BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
- int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) {
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
+ int *calc_pred_masked_compound) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type(
int64_t tmp_skip_sse_sb;
const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
+ preds1, residual1, diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+
best_rd_cur =
pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
*rs2 += get_interinter_compound_mask_rate(x, mbmi);
best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
- if (have_newmv_in_inter_mode(this_mode) &&
- use_masked_motion_search(compound_type)) {
+  // Although the true rate_mv might differ after motion search, this mode is
+  // unlikely to be the best one once the transform rd cost and other mode
+  // overhead costs are taken into account.
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > ref_best_rd) return INT64_MAX;
+
+ if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
*out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
this_mode, mi_row, mi_col);
av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
if (rd >= best_rd_cur) {
mbmi->mv[0].as_int = cur_mv[0].as_int;
@@ -7508,12 +7746,72 @@ typedef struct {
int (*single_newmv_valid)[REF_FRAMES];
// Pointer to array of predicted rate-distortion
// Should point to first of 2 arrays in 2D array
- int64_t (*modelled_rd)[REF_FRAMES];
+ int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
int ref_frame_cost;
int single_comp_cost;
+ int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ int skip_motion_mode;
+ INTERINTRA_MODE *inter_intra_mode;
} HandleInterModeArgs;
+/* If the current mode shares the same mv with another mode that has lower
+ * signaling cost, skip this mode. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+ const MACROBLOCK *const x,
+ PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2],
+ InterModeSearchState *search_state) {
+ const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+ if (!is_comp_pred) {
+ if (this_mode == NEARMV) {
+ if (ref_mv_count == 0) {
+ // NEARMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // NEARMV has the same motion vector as GLOBALMV
+ compare_mode = GLOBALMV;
+ }
+ }
+ if (this_mode == GLOBALMV) {
+ if (ref_mv_count == 0 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // GLOBALMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1) {
+ // GLOBALMV has the same motion vector as NEARMV
+ compare_mode = NEARMV;
+ }
+ }
+
+ if (compare_mode != MB_MODE_COUNT) {
+ // Use modelled_rd to check whether compare mode was searched
+ if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] !=
+ INT64_MAX) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+ const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
+ const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+
+ // Only skip if the mode cost is larger than compare mode cost
+ if (this_cost > compare_cost) {
+ search_state->modelled_rd[this_mode][0][ref_frames[0]] =
+ search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
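
Unlike the removed version, this skip_repeated_mv drops a duplicate-MV mode
only when the already-searched mode with the same motion vector was cheaper
to signal, and it forwards that mode's modelled_rd so later pruning still
sees a valid estimate. The decision core as a condensed, hedged sketch;
cost_mv_ref and the context derivation are as in the function above:

    #include <stdint.h>

    // Skip 'this' mode iff a mode with an identical MV was already
    // searched (its modelled rd is valid) and is cheaper to signal.
    static int sketch_skip_duplicate_mv(int64_t compare_modelled_rd,
                                        int this_cost, int compare_cost,
                                        int64_t *this_modelled_rd) {
      if (compare_modelled_rd == INT64_MAX) return 0;  // not searched yet
      if (this_cost <= compare_cost) return 0;  // this mode is cheaper
      *this_modelled_rd = compare_modelled_rd;  // reuse the estimate
      return 1;
    }
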
static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
const AV1_COMMON *cm,
const MACROBLOCK *x) {
@@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd(
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- int tmp_rate, tmp_skip_sb = 0;
- int64_t tmp_dist, tmp_skip_sse = INT64_MAX;
+ int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
+ int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
const InterpFilters last_best = mbmi->interp_filters;
mbmi->interp_filters = filter_sets[filter_idx];
const int tmp_rs =
get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
- if (!skip_pred) {
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
- av1_subtract_plane(x, bsize, 0);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist,
- &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb,
- &tmp_skip_sse, NULL, NULL, NULL);
-#endif
+ assert(skip_pred != 2);
+ assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
+ assert(rate[0] >= 0);
+ assert(dist[0] >= 0);
+ assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
+ assert(skip_sse_sb[0] >= 0);
+ assert(rate[1] >= 0);
+ assert(dist[1] >= 0);
+ assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
+ assert(skip_sse_sb[1] >= 0);
+
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
+ tmp_rate[1] = tmp_rate[0];
+ tmp_dist[1] = tmp_dist[0];
+ } else {
+ // only luma MC is skipped
+ tmp_rate[1] = rate[0];
+ tmp_dist[1] = dist[0];
+ }
if (num_planes > 1) {
- int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
- if (tmp_y_rd > *rd) {
- mbmi->interp_filters = last_best;
- return 0;
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int tmp_rate_uv, tmp_skip_sb_uv;
+ int64_t tmp_dist_uv, tmp_skip_sse_uv;
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane);
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
+ &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
+ tmp_rate[1] =
+ (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
+ tmp_dist[1] += tmp_dist_uv;
+ tmp_skip_sb[1] &= tmp_skip_sb_uv;
+ tmp_skip_sse[1] += tmp_skip_sse_uv;
}
- int tmp_rate_uv, tmp_skip_sb_uv;
- int64_t tmp_dist_uv, tmp_skip_sse_uv;
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
- for (int plane = 1; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1,
- &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv,
- &tmp_skip_sse_uv, NULL, NULL, NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv,
- &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL,
- NULL, NULL);
-#endif
- tmp_rate += tmp_rate_uv;
- tmp_skip_sb &= tmp_skip_sb_uv;
- tmp_dist += tmp_dist_uv;
- tmp_skip_sse += tmp_skip_sse_uv;
}
} else {
- tmp_rate = *rate;
- tmp_dist = *dist;
+ // both luma and chroma MC is skipped
+ tmp_rate[1] = rate[1];
+ tmp_dist[1] = dist[1];
}
- int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+
if (tmp_rd < *rd) {
*rd = tmp_rd;
*switchable_rate = tmp_rs;
- *skip_txfm_sb = tmp_skip_sb;
- *skip_sse_sb = tmp_skip_sse;
- *rate = tmp_rate;
- *dist = tmp_dist;
- if (!skip_pred) {
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred == 0) {
+ // Overwrite the data as current filter is the best one
+ tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
+ tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
+ memcpy(rate, tmp_rate, sizeof(*rate) * 2);
+ memcpy(dist, tmp_dist, sizeof(*dist) * 2);
+ memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
+ memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+        // Luma MC data was not computed, so the luma entries need no update
+ rate[1] = tmp_rate[1];
+ dist[1] = tmp_dist[1];
+ skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
+ skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
+        // Luma MC was skipped while evaluating this (best-so-far) filter,
+        // so luma MC data may have to be regenerated after the search.
+        // Toggle the flag: if the current buffer holds valid MC data, this
+        // records that it must be recomputed for the chosen filter.
+ x->recalc_luma_mc_data ^= 1;
+ }
swap_dst_buf(xd, dst_bufs, num_planes);
}
return 1;
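
interpolation_filter_rd now carries rate/dist as two-entry arrays (index 0:
luma only, index 1: all planes accumulated) so that, depending on skip_pred,
previously computed luma numbers are reused instead of re-running luma MC,
and each extra chroma plane can abort the candidate early. The
accumulate-with-early-exit shape as a hedged sketch; toy_rd stands in for
RDCOST and its constants are illustrative:

    #include <stdint.h>

    static int64_t toy_rd(int rdmult, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> 9) + (dist << 7);
    }

    // Accumulate per-plane model costs, pruning the filter candidate as
    // soon as the running total already exceeds the best cost so far.
    static int accumulate_planes(int num_planes, const int *plane_rate,
                                 const int64_t *plane_dist, int rs,
                                 int rdmult, int64_t best_rd) {
      int rate = rs;  // switchable-filter signaling cost
      int64_t dist = 0;
      for (int p = 0; p < num_planes; ++p) {
        rate += plane_rate[p];
        dist += plane_dist[p];
        if (toy_rd(rdmult, rate, dist) >= best_rd) return 0;  // prune
      }
      return 1;  // candidate survives; caller updates the best filter
    }
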
@@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd(
int i;
const int bw = block_size_wide[bsize];
assert(best_dual_mode == 0);
- if ((bw <= 4) && (!skip_hor)) {
- int skip_pred = 1;
+ if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
// Process the filters in reverse order to enable reusing rate and
      // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
@@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd(
dist)) {
best_dual_mode = i;
}
- skip_pred = 0;
+ skip_pred = skip_hor;
}
} else {
for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
@@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd(
int best_dual_mode, int filter_set_size) {
int i;
const int bh = block_size_high[bsize];
- if ((bh <= 4) && (!skip_ver)) {
- int skip_pred = 1;
+ if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
// Process the filters in reverse order to enable reusing rate and
      // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
assert(filter_set_size == DUAL_FILTER_SET_SIZE);
@@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd(
switchable_rate, skip_txfm_sb, skip_sse_sb,
dst_bufs, i, switchable_ctx, skip_pred, rate,
dist);
- skip_pred = 0;
+ skip_pred = skip_ver;
}
} else {
for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
@@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
return 0;
}
}
+ if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0;
return 1;
}
@@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
const int comp_idx = mbmi->compound_idx;
const int offset = x->interp_filter_stats_idx[comp_idx];
if (offset < MAX_INTERP_FILTER_STATS) {
- INTERPOLATION_FILTER_STATS stat = {
- mbmi->interp_filters,
- { mbmi->mv[0], mbmi->mv[1] },
- { mbmi->ref_frame[0], mbmi->ref_frame[1] },
- };
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type };
x->interp_filter_stats[comp_idx][offset] = stat;
x->interp_filter_stats_idx[comp_idx]++;
}
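
save_interp_filter_search_stat caches the winning filter pair keyed by the
motion vectors, reference frames and (after the fix above) the compound
type, and is_interp_filter_match replays it when the same prediction setup
recurs, skipping the whole filter search; adding comp_type closes a false
match between wedge and diffwtd compounds that share MVs and references. The
same memoization pattern as a generic sketch; the types are hypothetical:

    #include <string.h>

    // PredKey is all ints, so there is no padding on common ABIs and
    // memcmp is a safe equality test here.
    typedef struct {
      int mv[2];
      int refs[2];
      int comp_type;
    } PredKey;

    typedef struct {
      PredKey key;
      int filters;
    } FilterStat;

    // Return cached filters for 'key', or -1 if this prediction setup has
    // not been searched yet; 'n' entries of 'stats' are valid.
    static int lookup_filter_stat(const FilterStat *stats, int n,
                                  const PredKey *key) {
      for (int i = 0; i < n; ++i)
        if (memcmp(&stats[i].key, key, sizeof(*key)) == 0)
          return stats[i].filters;
      return -1;
    }
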
@@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search(
int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
- int64_t *const skip_sse_sb) {
+ int64_t *const skip_sse_sb, const int skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int need_search =
av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
- int i, tmp_rate;
- int64_t tmp_dist;
+ int i;
+  // Index 0 corresponds to luma rd data and index 1 corresponds to cumulative
+  // data of all planes
+ int tmp_rate[2] = { 0, 0 };
+ int64_t tmp_dist[2] = { 0, 0 };
+ int best_skip_txfm_sb[2] = { 1, 1 };
+ int64_t best_skip_sse_sb[2] = { 0, 0 };
+ const int ref_frame = xd->mi[0]->ref_frame[0];
(void)single_filter;
int match_found = -1;
@@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search(
switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
*switchable_rate =
get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
- for (int plane = 0; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
- &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL,
- NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
- skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
-#endif // DNN_BASED_RD_INTERP_FILTER
- *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
+ if (!skip_build_pred)
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
+ if (num_planes > 1)
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
+ &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
+ NULL);
+ tmp_rate[1] =
+ (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
+ assert(tmp_rate[1] >= 0);
+ tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
+ best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
+ best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
+ *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
if (assign_filter != SWITCHABLE || match_found != -1) {
return 0;
@@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search(
av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
return 0;
}
- int skip_hor = 1;
- int skip_ver = 1;
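+ // Prune a compound mode early when its model rd is already more than twice
+ // the better of its two single-reference modelled rds.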
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+ // skip_flag is a two-bit value (in binary form):
+ // Setting bit 0 corresponds to skipping luma MC and setting bit 1
+ // corresponds to skipping chroma MC.
+ // skip_flag=0 corresponds to "Don't skip luma and chroma MC"
+ // skip_flag=1 corresponds to "Skip luma MC only"
+ // skip_flag=2 is not a valid case
+ // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+ int skip_hor = cpi->default_interp_skip_flags;
+ int skip_ver = cpi->default_interp_skip_flags;
const int is_compound = has_second_ref(mbmi);
- for (int k = 0; k < num_planes - 1; ++k) {
- struct macroblockd_plane *const pd = &xd->plane[k];
- const int bw = pd->width;
- const int bh = pd->height;
- for (int j = 0; j < 1 + is_compound; ++j) {
- const MV mv = mbmi->mv[j].as_mv;
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int j = 0; j < 1 + is_compound; ++j) {
+ const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
+ const struct scale_factors *const sf = &ref_buf->sf;
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ skip_hor = 0;
+ skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[j].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
+ struct macroblockd_plane *const pd = &xd->plane[k];
+ const int bw = pd->width;
+ const int bh = pd->height;
const MV mv_q4 = clamp_mv_to_umv_border_sb(
xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- skip_hor &= (sub_x == 0);
- skip_ver &= (sub_y == 0);
- }
+ skip_hor_plane |= ((sub_x == 0) << k);
+ skip_ver_plane |= ((sub_y == 0) << k);
+ }
+ skip_hor = skip_hor & skip_hor_plane;
+ skip_ver = skip_ver & skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(skip_hor != 2);
+ assert(skip_ver != 2);
+ }
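+ // Here bit 0 of skip_hor/skip_ver tracks the luma plane and bit 1 the
+ // chroma plane, so a value of 3 means every plane's MV component lies at an
+ // integer position and that direction's MC can be skipped during the filter
+ // search. A value of 2 (chroma at an integer position while luma is
+ // sub-pel) is not a valid combination, hence the asserts above.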
+ // When the compound prediction type is COMPOUND_DIFFWTD, luma MC and chroma
+ // MC need to go hand in hand, as the mask generated during luma MC is
+ // required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask used
+ // for chroma MC during the vertical filter decision may be incorrect, as the
+ // temporary MC evaluation overwrites the mask. Set skip_ver to 0 in this
+ // case so that the mask is populated during luma MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
}
// do interp_filter search
const int filter_set_size = DUAL_FILTER_SET_SIZE;
@@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search(
// EIGHTTAP_REGULAR mode is calculated beforehand
best_dual_mode = find_best_horiz_interp_filter_rd(
x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
- skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
- &tmp_rate, &tmp_dist, best_dual_mode);
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+ tmp_rate, tmp_dist, best_dual_mode);
// From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
find_best_vert_interp_filter_rd(
x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
- skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
- &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size);
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+ tmp_rate, tmp_dist, best_dual_mode, filter_set_size);
} else {
// EIGHTTAP_REGULAR mode is calculated beforehand
for (i = 1; i < filter_set_size; ++i) {
@@ -7912,12 +8316,25 @@ static int64_t interpolation_filter_search(
if (filter_x != filter_y) continue;
}
interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i, switchable_ctx, 0, &tmp_rate,
- &tmp_dist);
+ switchable_rate, best_skip_txfm_sb,
+ best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0,
+ tmp_rate, tmp_dist);
+ assert(x->recalc_luma_mc_data == 0);
}
}
swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+ // Recomputing the final luma MC data is required only if it was skipped in
+ // either of the filter directions. The condition below is necessary, but
+ // not sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
// save search results
if (cpi->sf.skip_repeat_interpolation_filter_search) {
assert(match_found == -1);
@@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search(
return 0;
}
+static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int mode_rate, int64_t ref_best_rd) {
+ /*
+ * This function combines the y and uv planes' transform search processes
+ * together once the prediction has been generated. It first does subtraction
+ * to obtain the prediction error, then calls
+ * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
+ * handles the early terminations that happen in those functions. At the end,
+ * it computes rd_stats/_y/_uv accordingly.
+ */
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int skip_txfm_sb = 0;
+ const int num_planes = av1_num_planes(cm);
+ const int ref_frame_1 = mbmi->ref_frame[1];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ // Account for the minimum skip and non-skip rd.
+ // Eventually one of them will be added to mode_rate.
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
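+ // RDCOST folds a rate (in cost units) and a distortion into one comparable
+ // number, roughly rate * rdmult scaled down plus distortion scaled up (see
+ // the macro in av1/encoder/rd.h). A header-only lower bound can therefore
+ // be checked against ref_best_rd before any transform search is done.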
+
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ av1_init_rd_stats(rd_stats_uv);
+ rd_stats->rate = mode_rate;
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+ if (!skip_txfm_sb) {
+ int64_t non_skip_rdcosty = INT64_MAX;
+ int64_t skip_rdcosty = INT64_MAX;
+ int64_t min_rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Motion mode
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y->skip);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ // TODO(angiebird): check if we need this
+ // restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ non_skip_rdcosty = RDCOST(
+ x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
+ skip_rdcosty =
+ RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
+ min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+
+ if (min_rdcosty > ref_best_rd) {
+ int64_t tokenonly_rdy =
+ AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+ RDCOST(x->rdmult, 0, rd_stats_y->sse));
+ // Invalidate rd_stats_y to skip the rest of the motion mode search.
+ if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
+ rd_thresh)
+ av1_invalid_rd_stats(rd_stats_y);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ if (num_planes > 1) {
+ /* clang-format off */
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
+ ref_best_rd - non_skip_rdcosty,
+ ref_best_rd - skip_rdcosty, FTXS_NONE);
+ if (!is_cost_valid_uv) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ } else {
+ av1_init_rd_stats(rd_stats_uv);
+ }
+ if (rd_stats->skip) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ mbmi->skip = 0;
+ // Here mbmi->skip temporarily plays the role that this_skip2 does.
+
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][0],
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ mbmi->skip = 0;
+ }
+ } else {
+ x->skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ // The cost of skip bit needs to be added.
+ mbmi->skip = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ }
+ return 1;
+}
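+// For a block whose transforms are not already all skipped, the skip decision
+// in txfm_search() reduces to comparing
+//   RDCOST(rdmult, rate_y + rate_uv + skip_cost[ctx][0], dist)
+// against
+//   RDCOST(rdmult, skip_cost[ctx][1], sse)
+// and coding the residue only when the former is strictly cheaper.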
+
+static int handle_inter_intra_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args,
+ int64_t ref_best_rd, int *rate_mv,
+ int *tmp_rate2, BUFFER_SET *orig_dst) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t rd, best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int bw = block_size_wide[bsize];
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int *const interintra_mode_cost =
+ x->interintra_mode_cost[size_group_lookup[bsize]];
+ const int_mv mv0 = mbmi->mv[0];
+ const int is_wedge_used = is_interintra_wedge_used(bsize);
+ int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
+ int j = 0;
+ if (cpi->sf.reuse_inter_intra_mode == 0 ||
+ best_interintra_mode == INTERINTRA_MODES) {
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ }
+ if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
+ best_interintra_rd = rd;
+ if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
+ return -1;
+ }
+ if (is_wedge_used) {
+ int64_t best_interintra_rd_nowedge = rd;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ x->wedge_interintra_cost[bsize][1];
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
+ rd = INT64_MAX;
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ // get negative of mask
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ tmp_mv = mbmi->mv[0];
+ compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+ mi_col, intrapred, mask, bw, &tmp_rate_mv,
+ 0);
+ if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+ bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+ dist_sum);
+ }
+ }
+ if (rd >= best_interintra_rd_wedge) {
+ tmp_mv.as_int = mv0.as_int;
+ tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ }
+ } // if (is_interintra_wedge_used(bsize))
+ if (num_planes > 1) {
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ return 0;
+}
+
// TODO(afergs): Refactor the MBMI references in here - there's four
// TODO(afergs): Refactor optional args - add them to a struct or remove
static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
int *disable_skip, int mi_row, int mi_col,
HandleInterModeArgs *const args,
- int64_t ref_best_rd, const int *refs, int rate_mv,
- BUFFER_SET *orig_dst
+ int64_t ref_best_rd, const int *refs,
+ int *rate_mv, BUFFER_SET *orig_dst
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
,
- int64_t *best_est_rd
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ int do_tx_search, InterModesInfo *inter_modes_info
#endif
) {
const AV1_COMMON *const cm = &cpi->common;
@@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
MB_MODE_INFO *mbmi = xd->mi[0];
const int is_comp_pred = has_second_ref(mbmi);
const PREDICTION_MODE this_mode = mbmi->mode;
- int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip, best_disable_skip = 0;
RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
MB_MODE_INFO base_mbmi, best_mbmi;
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+
int interintra_allowed = cm->seq_params.enable_interintra_compound &&
is_interintra_allowed(mbmi) && mbmi->compound_idx;
int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
- int total_samples;
-
- (void)rate_mv;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
av1_invalid_rd_stats(&best_rd_stats);
-
aom_clear_system_state();
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
- total_samples = mbmi->num_proj_ref[0];
- rate2_nocoeff = rd_stats->rate;
+ mbmi->num_proj_ref = 1; // assume num_proj_ref >=1
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ if (cm->switchable_motion_mode) {
+ last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion);
+ }
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+ }
+ int total_samples = mbmi->num_proj_ref;
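+ // Without any valid projection samples, WARPED_CAUSAL cannot be evaluated,
+ // so cap the motion mode search at OBMC_CAUSAL.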
+ if (total_samples == 0) {
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
base_mbmi = *mbmi;
- MOTION_MODE last_motion_mode_allowed =
- cm->switchable_motion_mode
- ? motion_mode_allowed(xd->global_motion, xd, mbmi,
- cm->allow_warped_motion)
- : SIMPLE_TRANSLATION;
- assert(mbmi->ref_frame[1] != INTRA_FRAME);
- const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ const int switchable_rate =
+ av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
int64_t best_rd = INT64_MAX;
-
+ int best_rate_mv = rate_mv0;
for (int mode_index = (int)SIMPLE_TRANSLATION;
mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;
int64_t tmp_rd = INT64_MAX;
int tmp_rate2 = rate2_nocoeff;
int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
int skip_txfm_sb = 0;
+ int tmp_rate_mv = rate_mv0;
*mbmi = base_mbmi;
if (is_interintra_mode) {
@@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
// The prediction is calculated before motion_mode_rd() is called in
// handle_inter_mode()
} else if (mbmi->motion_mode == OBMC_CAUSAL) {
- mbmi->motion_mode = OBMC_CAUSAL;
- if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
- int tmp_rate_mv = 0;
-
+ uint32_t cur_mv = mbmi->mv[0].as_int;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
mbmi->mv[0].as_int = x->best_mv.as_int;
#if USE_DISCOUNT_NEWMV_TEST
@@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
#endif
- tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
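+ // Rebuild the inter prediction only if the OBMC motion search actually
+ // changed the MV; otherwise the existing prediction is reused.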
+ if (mbmi->mv[0].as_int != cur_mv) {
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
}
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
av1_build_obmc_inter_prediction(
cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
args->left_pred_buf, args->left_pred_stride);
} else if (mbmi->motion_mode == WARPED_CAUSAL) {
int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
mbmi->motion_mode = WARPED_CAUSAL;
- mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
mbmi->interp_filters = av1_broadcast_interp_filter(
av1_unswitchable_filter(cm->interp_filter));
memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
// Select the samples according to motion vector difference
- if (mbmi->num_proj_ref[0] > 1) {
- mbmi->num_proj_ref[0] = selectSamples(
- &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
}
- if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
- &mbmi->wm_params[0], mi_row, mi_col)) {
+ &mbmi->wm_params, mi_row, mi_col)) {
// Refine MV for NEWMV mode
- if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
- int tmp_rate_mv = 0;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
const int_mv mv0 = mbmi->mv[0];
- const WarpedMotionParams wm_params0 = mbmi->wm_params[0];
- int num_proj_ref0 = mbmi->num_proj_ref[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ int num_proj_ref0 = mbmi->num_proj_ref;
// Refine MV in a small range.
av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
@@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
#endif
- tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
} else {
// Restore the old MV and WM parameters.
mbmi->mv[0] = mv0;
- mbmi->wm_params[0] = wm_params0;
- mbmi->num_proj_ref[0] = num_proj_ref0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
}
}
@@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
continue;
}
} else if (is_interintra_mode) {
- INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
- int64_t rd, best_interintra_rd = INT64_MAX;
- int rmode, rate_sum;
- int64_t dist_sum;
- int j;
- int tmp_rate_mv = 0;
- int tmp_skip_txfm_sb;
- int bw = block_size_wide[bsize];
- int64_t tmp_skip_sse_sb;
- DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
- uint8_t *tmp_buf, *intrapred;
- const int *const interintra_mode_cost =
- x->interintra_mode_cost[size_group_lookup[bsize]];
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
- intrapred = CONVERT_TO_BYTEPTR(intrapred_);
- } else {
- tmp_buf = tmp_buf_;
- intrapred = intrapred_;
- }
- const int_mv mv0 = mbmi->mv[0];
-
- mbmi->ref_frame[1] = NONE_FRAME;
- xd->plane[0].dst.buf = tmp_buf;
- xd->plane[0].dst.stride = bw;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
-
- restore_dst_buf(xd, *orig_dst, num_planes);
- mbmi->ref_frame[1] = INTRA_FRAME;
- mbmi->use_wedge_interintra = 0;
- for (j = 0; j < INTERINTRA_MODES; ++j) {
- mbmi->interintra_mode = (INTERINTRA_MODE)j;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
- intrapred, bw);
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
- rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
- if (rd < best_interintra_rd) {
- best_interintra_rd = rd;
- best_interintra_mode = mbmi->interintra_mode;
- }
- }
- mbmi->interintra_mode = best_interintra_mode;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
- intrapred, bw);
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
- best_interintra_rd = rd;
-
- if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
- // restore ref_frame[1]
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
- }
-
- if (is_interintra_wedge_used(bsize)) {
- int64_t best_interintra_rd_nowedge = INT64_MAX;
- int64_t best_interintra_rd_wedge = INT64_MAX;
- int_mv tmp_mv;
- InterpFilters backup_interp_filters = mbmi->interp_filters;
- int rwedge = x->wedge_interintra_cost[bsize][0];
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum);
- best_interintra_rd_nowedge = rd;
-
- // Disable wedge search if source variance is small
- if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
- mbmi->use_wedge_interintra = 1;
-
- rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
- x->wedge_interintra_cost[bsize][1];
-
- best_interintra_rd_wedge =
- pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-
- best_interintra_rd_wedge +=
- RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
- // Refine motion vector.
- if (have_newmv_in_inter_mode(mbmi->mode)) {
- // get negative of mask
- const uint8_t *mask = av1_get_contiguous_soft_mask(
- mbmi->interintra_wedge_index, 1, bsize);
- tmp_mv = av1_get_ref_mv(x, 0);
- compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
- mi_col, intrapred, mask, bw,
- &tmp_rate_mv, 0);
- mbmi->mv[0].as_int = tmp_mv.as_int;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
- bsize);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
- NULL);
- rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
- dist_sum);
- if (rd >= best_interintra_rd_wedge) {
- tmp_mv.as_int = mv0.as_int;
- tmp_rate_mv = rate_mv;
- mbmi->interp_filters = backup_interp_filters;
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- }
- } else {
- tmp_mv.as_int = mv0.as_int;
- tmp_rate_mv = rate_mv;
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- }
- // Evaluate closer to true rd
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
- INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
- dist_sum);
- best_interintra_rd_wedge = rd;
- if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
- mbmi->use_wedge_interintra = 1;
- mbmi->mv[0].as_int = tmp_mv.as_int;
- tmp_rate2 += tmp_rate_mv - rate_mv;
- } else {
- mbmi->use_wedge_interintra = 0;
- mbmi->mv[0].as_int = mv0.as_int;
- mbmi->interp_filters = backup_interp_filters;
- }
- } else {
- mbmi->use_wedge_interintra = 0;
- }
- } // if (is_interintra_wedge_used(bsize))
- restore_dst_buf(xd, *orig_dst, num_planes);
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ const int ret = handle_inter_intra_mode(
+ cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
+ &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
}
if (!cpi->common.all_lossless)
@@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
rd_stats->sse = 0;
rd_stats->skip = 1;
rd_stats->rate = tmp_rate2;
- if (av1_is_interp_needed(xd))
- rd_stats->rate += av1_get_switchable_rate(cm, x, xd);
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
if (interintra_allowed) {
rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
[mbmi->ref_frame[1] == INTRA_FRAME];
@@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
}
}
+
if (!skip_txfm_sb) {
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
int64_t est_rd = 0;
int est_skip = 0;
- if (cpi->sf.inter_mode_rd_model_estimation) {
- InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type];
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
if (md->ready) {
const int64_t curr_sse = get_sse(cpi, x);
- est_rd =
- get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate);
+ est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
+ rd_stats->rate);
est_skip = est_rd * 0.8 > *best_est_rd;
-#if INTER_MODE_RD_TEST
- if (est_rd < *best_est_rd) {
- *best_est_rd = est_rd;
- }
-#else // INTER_MODE_RD_TEST
if (est_skip) {
- ++md->skip_count;
mbmi->ref_frame[1] = ref_frame_1;
continue;
} else {
if (est_rd < *best_est_rd) {
*best_est_rd = est_rd;
}
- ++md->non_skip_count;
}
-#endif // INTER_MODE_RD_TEST
}
}
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
- int64_t rdcosty = INT64_MAX;
- int is_cost_valid_uv = 0;
-
- // cost and distortion
- av1_subtract_plane(x, bsize, 0);
- if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
- // Motion mode
- select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col,
- ref_best_rd);
-#if CONFIG_COLLECT_RD_STATS == 2
- PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
-#endif // CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
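+ // The exact transform search is deferred here: the estimated residue cost
+ // and distortion stand in for the real values, and the candidate is queued
+ // in inter_modes_info for exact evaluation later.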
+ if (cm->reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
} else {
- super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
- memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats_y->skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
}
-
- if (rd_stats_y->rate == INT_MAX) {
- av1_invalid_rd_stats(rd_stats);
- if (mbmi->motion_mode != SIMPLE_TRANSLATION ||
- mbmi->ref_frame[1] == INTRA_FRAME) {
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
- } else {
- restore_dst_buf(xd, *orig_dst, num_planes);
- mbmi->ref_frame[1] = ref_frame_1;
+ } else {
+#endif
+ int mode_rate = rd_stats->rate;
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
return INT64_MAX;
}
+ continue;
}
-
- av1_merge_rd_stats(rd_stats, rd_stats_y);
-
- rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse));
- if (num_planes > 1) {
- /* clang-format off */
- is_cost_valid_uv =
- inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty,
- FTXS_NONE);
- if (!is_cost_valid_uv) {
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
+ if (!skip_txfm_sb) {
+ const int64_t curr_rd =
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
}
- /* clang-format on */
- av1_merge_rd_stats(rd_stats, rd_stats_uv);
- } else {
- av1_init_rd_stats(rd_stats_uv);
- }
-#if CONFIG_RD_DEBUG
- // record transform block coefficient cost
- // TODO(angiebird): So far rd_debug tool only detects discrepancy of
- // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi
- // here because we already collect the coefficient cost. Move this part to
- // other place when we need to compare non-coefficient cost.
- mbmi->rd_stats = *rd_stats;
-#endif // CONFIG_RD_DEBUG
- const int skip_ctx = av1_get_skip_context(xd);
- if (rd_stats->skip) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- mbmi->skip = 0;
- // here mbmi->skip temporarily plays a role as what this_skip2 does
- } else if (!xd->lossless[mbmi->segment_id] &&
- (RDCOST(x->rdmult,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][0],
- rd_stats->dist) >= RDCOST(x->rdmult,
- x->skip_cost[skip_ctx][1],
- rd_stats->sse))) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- rd_stats->dist = rd_stats->sse;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- mbmi->skip = 1;
- } else {
- rd_stats->rate += x->skip_cost[skip_ctx][0];
- mbmi->skip = 0;
- }
- *disable_skip = 0;
+ *disable_skip = 0;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
- cm->tile_rows == 1) {
-#if INTER_MODE_RD_TEST
- if (md->ready) {
- int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (est_skip) {
- ++md->skip_count;
- if (real_rd < ref_best_rd) {
- ++md->fp_skip_count;
- }
- // int fp_skip = real_rd < ref_best_rd;
- // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd
- // %ld ref_best_rd %ld\n",
- // est_skip, fp_skip, est_rd, *best_est_rd, real_rd,
- // ref_best_rd);
- } else {
- ++md->non_skip_count;
- }
+ if (cpi->sf.inter_mode_rd_model_estimation) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+ rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
}
-#endif // INTER_MODE_RD_TEST
- inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][mbmi->skip],
- rd_stats->rate, ref_best_rd);
- }
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
- int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (curr_rd < ref_best_rd) {
- ref_best_rd = curr_rd;
+ } else {
+ *disable_skip = 1;
}
- } else {
- x->skip = 1;
- *disable_skip = 1;
- mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
-
- // The cost of skip bit needs to be added.
- mbmi->skip = 0;
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
-
- rd_stats->dist = 0;
- rd_stats->sse = 0;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- rd_stats->skip = 1;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
}
+#endif
if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
if (is_nontrans_global_motion(xd, xd->mi[0])) {
@@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if ((mbmi->motion_mode == SIMPLE_TRANSLATION &&
- mbmi->ref_frame[1] != INTRA_FRAME) ||
- (tmp_rd < best_rd)) {
+ if (mode_index == 0)
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ if ((mode_index == 0) || (tmp_rd < best_rd)) {
best_mbmi = *mbmi;
best_rd = tmp_rd;
best_rd_stats = *rd_stats;
best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
best_xskip = x->skip;
best_disable_skip = *disable_skip;
if (best_xskip) break;
}
}
mbmi->ref_frame[1] = ref_frame_1;
-
+ *rate_mv = best_rate_mv;
if (best_rd == INT64_MAX) {
av1_invalid_rd_stats(rd_stats);
restore_dst_buf(xd, *orig_dst, num_planes);
@@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
*rd_stats_y = best_rd_stats_y;
if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
memcpy(x->blk_skip, best_blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
x->skip = best_xskip;
*disable_skip = best_disable_skip;
@@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
return 0;
}
-#ifndef NDEBUG
-static INLINE int is_single_inter_mode(int this_mode) {
- return this_mode >= SINGLE_INTER_MODE_START &&
- this_mode < SINGLE_INTER_MODE_END;
-}
-#endif
-
-static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
- assert(is_single_inter_mode(single_mode));
+static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
+ uint8_t ref_mv_idx) {
+ assert(is_inter_singleref_mode(single_mode));
int ref_mv_offset;
if (single_mode == NEARESTMV) {
ref_mv_offset = 0;
@@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
return ref_mv_offset;
}
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
- int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
const MV_REFERENCE_FRAME *ref_frame,
const MB_MODE_INFO_EXT *mbmi_ext) {
const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
- const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
- assert(is_single_inter_mode(single_mode));
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, ref_idx, is_comp_pred);
+ assert(is_inter_singleref_mode(single_mode));
if (single_mode == NEWMV) {
this_mv->as_int = INVALID_MV;
} else if (single_mode == GLOBALMV) {
@@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
}
// This function updates the non-new mv for the current prediction mode
-static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
const AV1_COMMON *cm, const MACROBLOCK *x) {
const MACROBLOCKD *xd = &x->e_mbd;
const MB_MODE_INFO *mbmi = xd->mi[0];
@@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
int_mv this_mv;
get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
x->mbmi_ext);
- const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, i, is_comp_pred);
if (single_mode == NEWMV) {
cur_mv[i] = this_mv;
} else {
@@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
return cost;
}
-static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int mi_col, int mi_row,
- int_mv *cur_mv, int masked_compound_used,
- BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst,
- int *rate_mv, int64_t *rd,
- RD_STATS *rd_stats, int64_t ref_best_rd) {
+// Struct for buffers used by compound_type_rd() function.
+// For sizes and alignment of these arrays, refer to
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+ uint8_t *pred0;
+ uint8_t *pred1;
+ int16_t *residual1; // src - pred1
+ int16_t *diff10; // pred1 - pred0
+ uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
+static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_col, int mi_row,
+ int_mv *cur_mv, int masked_compound_used,
+ BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+ CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats,
+ int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
- const int this_mode = mbmi->mode;
+ const PREDICTION_MODE this_mode = mbmi->mode;
const int bw = block_size_wide[bsize];
- const int bh = block_size_high[bsize];
int rate_sum, rs2;
int64_t dist_sum;
@@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
int64_t tmp_skip_sse_sb;
INTERINTER_COMPOUND_DATA best_compound_data;
best_compound_data.type = COMPOUND_AVERAGE;
- DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
- DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
- uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
- uint8_t *preds0[1] = { pred0 };
- uint8_t *preds1[1] = { pred1 };
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
int strides[1] = { bw };
int tmp_rate_mv;
const int num_pix = 1 << num_pels_log2_lookup[bsize];
const int mask_len = 2 * num_pix * sizeof(uint8_t);
COMPOUND_TYPE cur_type;
int best_compmode_interinter_cost = 0;
- int can_use_previous = cm->allow_warped_motion;
+ int calc_pred_masked_compound = 1;
best_mv[0].as_int = cur_mv[0].as_int;
best_mv[1].as_int = cur_mv[1].as_int;
*rd = INT64_MAX;
- if (masked_compound_used) {
- // get inter predictors to use for masked compound modes
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
- const struct buf_2d *const src = &x->plane[0].src;
- if (get_bitdepth_data_path_index(xd)) {
- aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1),
- bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
- } else {
- aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1,
- bw);
- aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
- }
- }
- const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0];
- const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst;
- const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst;
for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
if (!is_interinter_compound_used(cur_type, bsize)) continue;
@@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
}
masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
rs2 = masked_type_cost;
- // No need to call av1_build_inter_predictors_sby here
- // 1. COMPOUND_AVERAGE is always the first candidate
- // 2. av1_build_inter_predictors_sby has been called by
- // interpolation_filter_search
- int64_t est_rd =
- estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd < ref_best_rd) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (est_rd != INT64_MAX)
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ }
// use spare buffer for the following compound type tries
- restore_dst_buf(xd, *backup_buf, 1);
- if (est_rd != INT64_MAX)
- best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ restore_dst_buf(xd, *tmp_dst, 1);
} else {
mbmi->comp_group_idx = 1;
masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
@@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
*rd / 3 < ref_best_rd) {
best_rd_cur = build_and_cost_compound_type(
cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
- &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row,
- mi_col);
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
+ &calc_pred_masked_compound);
}
}
if (best_rd_cur < *rd) {
*rd = best_rd_cur;
best_compound_data = mbmi->interinter_comp;
if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
- memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len);
+ memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
}
best_compmode_interinter_cost = rs2;
if (have_newmv_in_inter_mode(this_mode)) {
- if (use_masked_motion_search(cur_type)) {
+ if (cur_type == COMPOUND_WEDGE) {
best_tmp_rate_mv = tmp_rate_mv;
best_mv[0].as_int = mbmi->mv[0].as_int;
best_mv[1].as_int = mbmi->mv[1].as_int;
@@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->comp_group_idx =
(best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
mbmi->interinter_comp = best_compound_data;
- memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len);
+ memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
}
if (have_newmv_in_inter_mode(this_mode)) {
mbmi->mv[0].as_int = best_mv[0].as_int;
mbmi->mv[1].as_int = best_mv[1].as_int;
- if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
rd_stats->rate += best_tmp_rate_mv - *rate_mv;
*rate_mv = best_tmp_rate_mv;
}
}
- restore_dst_buf(xd, *best_buf, 1);
+ restore_dst_buf(xd, *orig_dst, 1);
return best_compmode_interinter_cost;
}
+static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
+ MB_MODE_INFO *mbmi,
+ PREDICTION_MODE this_mode) {
+ for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1);
+ const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+ if (single_mode == NEWMV &&
+ args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE mode) {
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+ const int has_drl =
+ (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1;
+
+ return ref_set;
+}
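+// For example, NEARMV with ref_mv_count = 4 gives has_nearmv = 1 and
+// has_drl = 1, so up to AOMMIN(MAX_REF_MV_SERCH, 3) DRL candidates are
+// searched, while NEARESTMV always yields a single candidate.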
+
+typedef struct {
+ int64_t rd;
+ int drl_cost;
+ int rate_mv;
+ int_mv mv;
+} inter_mode_info;
+
static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, RD_STATS *rd_stats,
RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
int *disable_skip, int mi_row, int mi_col,
- HandleInterModeArgs *args, int64_t ref_best_rd
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf,
+ CompoundTypeRdBuffers *rd_buffers
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
,
- int64_t *best_est_rd
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ const int do_tx_search,
+ InterModesInfo *inter_modes_info
#endif
) {
const AV1_COMMON *cm = &cpi->common;
@@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO *mbmi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const int is_comp_pred = has_second_ref(mbmi);
- const int this_mode = mbmi->mode;
+ const PREDICTION_MODE this_mode = mbmi->mode;
int i;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int rate_mv = 0;
- DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
int64_t rd = INT64_MAX;
- BUFFER_SET orig_dst, tmp_dst;
+
+ // Do the first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one of
+ // these currently holds the best predictor, and use the other one for
+ // future predictions. In the end, copy from tmp_buf to dst if necessary.
+ struct macroblockd_plane *p = xd->plane;
+ BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
@@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO best_mbmi = *mbmi;
int best_disable_skip;
int best_xskip;
- int plane_rate[MAX_MB_PLANE] = { 0 };
- int64_t plane_sse[MAX_MB_PLANE] = { 0 };
- int64_t plane_dist[MAX_MB_PLANE] = { 0 };
int64_t newmv_ret_val = INT64_MAX;
int_mv backup_mv[2] = { { 0 } };
int backup_rate_mv = 0;
+ inter_mode_info mode_info[MAX_REF_MV_SERCH];
int comp_idx;
const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
(mbmi->mode != GLOBAL_GLOBALMV);
- const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
- ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 1);
-
// TODO(jingning): This should be deprecated shortly.
- const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
- const int ref_set =
- has_drl ? AOMMIN(MAX_REF_MV_SERCH,
- mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset)
- : 1;
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].rd = INT64_MAX;
+
if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
if (mbmi->ref_frame[0] == LAST2_FRAME ||
mbmi->ref_frame[0] == LAST3_FRAME ||
mbmi->ref_frame[1] == LAST2_FRAME ||
mbmi->ref_frame[1] == LAST3_FRAME) {
- if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset]
+ if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
.weight < REF_CAT_LEVEL) {
continue;
}
@@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
mode_ctx =
av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
- mbmi->num_proj_ref[0] = 0;
- mbmi->num_proj_ref[1] = 0;
+ mbmi->num_proj_ref = 0;
mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->ref_mv_idx = ref_mv_idx;
- if (is_comp_pred) {
- for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
- const int single_mode =
- get_single_mode(this_mode, ref_idx, is_comp_pred);
- if (single_mode == NEWMV &&
- args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]]
- .as_int == INVALID_MV)
- continue;
- }
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
+ continue;
}
rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
- rd_stats->rate +=
+ const int drl_cost =
get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+
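+ // Prune this ref_mv candidate when the header rate alone already costs more
+ // than ref_best_rd; NEARESTMV and NEAREST_NEARESTMV are never pruned here.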
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
- const RD_STATS backup_rd_stats = *rd_stats;
- const MB_MODE_INFO backup_mbmi = *mbmi;
int64_t best_rd2 = INT64_MAX;
+ const RD_STATS backup_rd_stats = *rd_stats;
// If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
int rs = 0;
int compmode_interinter_cost = 0;
- *rd_stats = backup_rd_stats;
- *mbmi = backup_mbmi;
mbmi->compound_idx = comp_idx;
-
if (is_comp_pred && comp_idx == 0) {
+ *rd_stats = backup_rd_stats;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 0;
const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
const int comp_index_ctx = get_comp_index_context(cm, xd);
@@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
} else {
rd_stats->rate += rate_mv;
}
+
+ if (cpi->sf.skip_repeated_newmv) {
+ if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+ int skip = 0;
+ int this_rate_mv = 0;
+ for (i = 0; i < ref_mv_idx; ++i) {
+ // Check if the motion search result is the same as a previous result
+ if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
+ // If the compared mode has no valid rd, it is unlikely this
+ // mode will be the best mode
+ if (mode_info[i].rd == INT64_MAX) {
+ skip = 1;
+ break;
+ }
+ // Compare the total costs, including the drl cost and the mv cost
+ if (mode_info[i].mv.as_int != INVALID_MV) {
+ const int compare_cost =
+ mode_info[i].rate_mv + mode_info[i].drl_cost;
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv,
+ &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+ const int this_cost = this_rate_mv + drl_cost;
+
+ if (compare_cost < this_cost) {
+ skip = 1;
+ break;
+ } else {
+ // If the cost is less than the current best result, make this
+ // the best and update the corresponding variables
+ if (best_mbmi.ref_mv_idx == i) {
+ assert(best_rd != INT64_MAX);
+ best_mbmi.ref_mv_idx = ref_mv_idx;
+ best_rd_stats.rate += this_cost - compare_cost;
+ best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+ best_rd_stats.dist);
+ if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+
+ skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (skip) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->modelled_rd[this_mode][i][refs[0]];
+ args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->simple_rd[this_mode][i][refs[0]];
+ mode_info[ref_mv_idx].rd = mode_info[i].rd;
+ mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+ mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
}
for (i = 0; i < is_comp_pred + 1; ++i) {
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
-
- // Initialise tmp_dst and orig_dst buffers to prevent "may be used
- // uninitialized" warnings in GCC when the stream is monochrome.
- memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
- memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
-
- // do first prediction into the destination buffer. Do the next
- // prediction into a temporary buffer. Then keep track of which one
- // of these currently holds the best predictor, and use the other
- // one for future predictions. In the end, copy from tmp_buf to
- // dst if necessary.
- for (i = 0; i < num_planes; i++) {
- tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
- tmp_dst.stride[i] = MAX_SB_SIZE;
- }
- for (i = 0; i < num_planes; i++) {
- orig_dst.plane[i] = xd->plane[i].dst.buf;
- orig_dst.stride[i] = xd->plane[i].dst.stride;
- }
-
const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
#if USE_DISCOUNT_NEWMV_TEST
// We don't include the cost of the second reference here, because there
@@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
continue;
}
- ret_val = interpolation_filter_search(
- x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
- args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
- if (ret_val != 0) {
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
- } else if (cpi->sf.model_based_post_interp_filter_breakout &&
- ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) {
- restore_dst_buf(xd, orig_dst, num_planes);
- if ((rd >> 4) > ref_best_rd) break;
- continue;
- }
-
+ int skip_build_pred = 0;
if (is_comp_pred && comp_idx) {
+ // Find matching interp filter or set to default interp filter
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
+
int64_t best_rd_compound;
compmode_interinter_cost = compound_type_rd(
cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
- &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats,
- ref_best_rd);
+ &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+ rd_stats, ref_best_rd);
if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
- if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) {
- int tmp_rate;
- int64_t tmp_dist;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst,
- bsize);
- for (int plane = 0; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
- &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
- plane_sse, plane_dist);
- rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+        // No need to call av1_build_inter_predictors_sby if
+        // COMPOUND_AVERAGE is selected, because it is the first
+        // candidate evaluated in compound_type_rd and the subsequent
+        // compound type search uses the tmp_dst buffer.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize);
+ skip_build_pred = 1;
}
}
+ ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+ args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
+ skip_build_pred, args, ref_best_rd);
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ if ((rd >> 3) * 2 > ref_best_rd) break;
+ continue;
+ }
+
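The breakout above trades the old division-based test (rd / 6) for shift-and-multiply thresholds: (rd >> 3) * 3 approximates rd * 3/8 and (rd >> 3) * 2 approximates rd / 4, with only a small rounding-down error from the shift. A minimal standalone sketch of the comparison, assuming non-negative 64-bit costs (editorial sketch, not part of the patch):

    #include <stdint.h>

    // Compare rd against ref_best_rd scaled by num/8 without a 64-bit
    // division, as in the breakout tests above.
    static int exceeds_eighths(int64_t rd, int64_t ref_best_rd, int num) {
      return (rd >> 3) * num > ref_best_rd;  // ~ rd * num / 8 > ref_best_rd
    }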
if (search_jnt_comp) {
        // If half of the model rd is already larger than the best_rd in
        // jnt_comp mode, keep the jnt_comp result and save the additional
        // search.
- if ((rd >> 1) > best_rd) {
+ if ((rd >> 3) * 4 > best_rd) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
@@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (is_comp_pred) {
const int mode0 = compound_ref0_mode(this_mode);
const int mode1 = compound_ref1_mode(this_mode);
- const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
- args->modelled_rd[mode1][refs[1]]);
- if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
- } else {
- args->modelled_rd[this_mode][refs[0]] = rd;
- }
- }
-
- if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
- // if current pred_error modeled rd is substantially more than the best
- // so far, do not bother doing full rd
- if (rd / 2 > ref_best_rd) {
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
}
}
-
rd_stats->rate += compmode_interinter_cost;
if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
// TODO(chengchen): this speed feature introduces big loss.
// Need better estimation of rate distortion.
+ int dummy_rate;
+ int64_t dummy_dist;
+ int plane_rate[MAX_MB_PLANE] = { 0 };
+ int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+ int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+
+ model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+ cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
+ &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
+ plane_dist);
+
rd_stats->rate += rs;
rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
rd_stats_y->rate = plane_rate[0];
@@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
} else {
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ret_val =
- motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
- disable_skip, mi_row, mi_col, args, ref_best_rd,
- refs, rate_mv, &orig_dst, best_est_rd);
+ ret_val = motion_mode_rd(
+ cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
+ mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
+ tile_data, best_est_rd, do_tx_search, inter_modes_info);
#else
ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
rd_stats_uv, disable_skip, mi_row, mi_col,
- args, ref_best_rd, refs, rate_mv, &orig_dst);
+ args, ref_best_rd, refs, &rate_mv, &orig_dst);
#endif
}
+ mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+ mode_info[ref_mv_idx].rate_mv = rate_mv;
if (ret_val != INT64_MAX) {
int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ mode_info[ref_mv_idx].rd = tmp_rd;
if (tmp_rd < best_rd) {
best_rd_stats = *rd_stats;
best_rd_stats_y = *rd_stats_y;
@@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
best_disable_skip = *disable_skip;
best_xskip = x->skip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
}
if (tmp_rd < best_rd2) {
@@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
restore_dst_buf(xd, orig_dst, num_planes);
}
-
- args->modelled_rd = NULL;
}
if (best_rd == INT64_MAX) return INT64_MAX;
@@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
assert(IMPLIES(mbmi->comp_group_idx == 1,
mbmi->interinter_comp.type != COMPOUND_AVERAGE));
memcpy(x->blk_skip, best_blk_skip,
- sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
}
@@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
int sadpb = x->sadperbit16;
int cost_list[5];
int bestsme = av1_full_pixel_search(
- cpi, x, bsize, &mvp_full, step_param, sadpb,
- cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
(MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
x->mv_limits = tmp_mv_limits;
@@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
} else {
super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats.skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats.skip);
}
if (num_planes > 1) {
super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
@@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
best_skip = x->skip;
best_rdcost = rdc_noskip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
}
if (!xd->lossless[mbmi->segment_id]) {
@@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
best_skip = x->skip;
best_rdcost = rdc_skip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
}
}
}
@@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
*rd_cost = best_rdcost;
x->skip = best_skip;
memcpy(x->blk_skip, best_blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
return best_rd;
}
@@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
mbmi->mv[0].as_int = 0;
const int64_t intra_yrd =
- rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
- &y_skip, bsize, best_rd, ctx);
+ rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, bsize, best_rd, ctx);
if (intra_yrd < best_rd) {
// Only store reconstructed luma when there's chroma RDO. When there's no
@@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
mbmi->uv_mode = UV_DC_PRED;
mbmi->ref_frame[0] = ref_frame;
mbmi->ref_frame[1] = second_ref_frame;
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
+ if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+ x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+ return;
+ }
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+ }
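The block above computes the MV reference list for a compound ref type on demand, with UINT8_MAX in ref_mv_count serving as a "not yet computed" sentinel (the sentinel is planted in set_params_rd_pick_inter_mode further below). A minimal sketch of the pattern, with hypothetical names:

    #include <stdint.h>

    #define REF_MVS_NOT_COMPUTED UINT8_MAX  // sentinel, as in ref_mv_count

    // Hypothetical lazy-fill helper: compute an entry on first use only.
    static void ensure_entry(uint8_t *count, int idx,
                             void (*compute)(uint8_t *count, int idx)) {
      if (count[idx] == REF_MVS_NOT_COMPUTED) compute(count, idx);
    }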
assert(this_mode == NEAREST_NEARESTMV);
if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
@@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
memset(search_state->best_mbmode.inter_tx_size,
search_state->best_mbmode.tx_size,
sizeof(search_state->best_mbmode.inter_tx_size));
- set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
// Set up color-related variables for skip mode.
@@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search(
} else {
super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats_y.skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y.skip);
}
if (num_planes > 1) {
- inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE);
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
+ FTXS_NONE);
} else {
av1_init_rd_stats(&rd_stats_uv);
}
@@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search(
static void set_params_rd_pick_inter_mode(
const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
- uint32_t mode_skip_mask[REF_FRAMES],
+ uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
unsigned int ref_costs_single[REF_FRAMES],
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
@@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode(
x->pred_mv_sad[ref_frame] = INT_MAX;
x->mbmi_ext->mode_context[ref_frame] = 0;
x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ int skip = 1;
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ skip = 0;
+ break;
+ }
+ }
+ }
+ if (skip) continue;
+ }
+ }
assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
yv12_mb);
}
}
-
- // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector
- // references for compound prediction, as not every pair of reference frames
- // woud be examined for the RD evaluation.
+  // Handle the compound reference frame types (ref_frame > ALTREF_FRAME).
for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
x->mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ continue;
+ }
+ }
av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
mi_col, mbmi_ext->mode_context);
@@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode(
}
}
-static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ MB_MODE_INFO *const mbmi,
PALETTE_MODE_INFO *const pmi,
unsigned int *ref_costs_single,
InterModeSearchState *search_state) {
@@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE_FRAME;
rate_overhead_palette = rd_pick_palette_intra_sby(
- cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
- best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL,
- NULL, NULL, NULL, ctx, best_blk_skip);
+ cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
if (pmi->palette_size[0] == 0) return;
memcpy(x->blk_skip, best_blk_skip,
@@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state,
av1_zero(search_state->single_newmv);
av1_zero(search_state->single_newmv_rate);
av1_zero(search_state->single_newmv_valid);
- for (int i = 0; i < MB_MODE_COUNT; ++i)
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
- search_state->modelled_rd[i][ref_frame] = INT64_MAX;
+ for (int i = 0; i < MB_MODE_COUNT; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SERCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ SingleInterModeState *state;
+
+ state = &search_state->single_state[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ state = &search_state->single_state_modelled[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+ }
+ }
+ }
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+ }
+ }
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
}
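The modelled_rd and simple_rd tables are now keyed as [mode][ref_mv_idx][ref_frame], with INT64_MAX marking entries that were never evaluated and MAX_REF_MV_SERCH bounding the new middle axis. A small sketch of the reduction that collect_single_states (below) performs over that axis; the helper name is illustrative:

    #include <stdint.h>

    // Smallest rd across the ref_mv candidates of one (mode, ref_frame)
    // pair; a result of INT64_MAX means no candidate was evaluated.
    static int64_t best_over_ref_mv_idx(const int64_t *rd, int ref_set) {
      int64_t best = rd[0];
      for (int i = 1; i < ref_set; ++i)
        if (rd[i] < best) best = rd[i];
      return best;
    }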
+// Case 1: return 0, meaning do not skip this mode
+// Case 2: return 1, meaning skip this mode completely
+// Case 3: return 2, meaning skip this mode's motion mode search, but still
+//         try its simple translation mode
static int inter_mode_search_order_independent_skip(
- const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
- int mi_row, int mi_col, uint32_t *mode_skip_mask,
- uint16_t *ref_frame_skip_mask) {
+ const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
+ uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
+ InterModeSearchState *search_state) {
const SPEED_FEATURES *const sf = &cpi->sf;
const AV1_COMMON *const cm = &cpi->common;
const struct segmentation *const seg = &cm->seg;
@@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip(
const unsigned char segment_id = mbmi->segment_id;
const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // The compound ref modes depend on the motion estimation results of
+      // the two single ref modes (the best MVs of the single ref modes are
+      // used as the starting points). If the current single ref mode is
+      // marked skip, we need to check whether it will be used in any
+      // compound ref mode.
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_type || rf[1] == ref_type) {
+          // Found a non-skipped compound ref mode that contains the
+          // current single ref, so this single ref can't be skipped
+          // completely. Skip only its motion mode search, and still try
+          // its simple translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ break;
+ }
+ }
+ }
+ }
+ if (skip_ref) return 1;
+ }
if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
!x->cb_partition_scan) {
@@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip(
return 1;
}
- if (skip_repeated_mv(cm, x, this_mode, ref_frame)) {
+ if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
return 1;
}
+ if (skip_motion_mode) {
+ return 2;
+ }
return 0;
}
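The caller in av1_rd_pick_inter_mode_sb (later in this hunk) folds this tri-state result into a skip decision plus the new args.skip_motion_mode flag. A sketch with simplified stand-in types:

    typedef struct { int skip_motion_mode; } Args;

    // Returns 1 when the mode should still be evaluated; ret is the
    // 0/1/2 value documented above.
    static int should_try_mode(int ret, Args *args) {
      if (ret == 1) return 0;               // skip the mode completely
      args->skip_motion_mode = (ret == 2);  // 2: simple translation only
      return 1;
    }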
@@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
set_default_interp_filters(mbmi, cm->interp_filter);
}
-static int handle_intra_mode(InterModeSearchState *search_state,
- const AV1_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int ref_frame_cost,
- const PICK_MODE_CONTEXT *ctx, int disable_skip,
- RD_STATS *rd_stats, RD_STATS *rd_stats_y,
- RD_STATS *rd_stats_uv) {
+static int64_t handle_intra_mode(InterModeSearchState *search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, int disable_skip,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv) {
const AV1_COMMON *cm = &cpi->common;
const SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state,
const int rows = block_size_high[bsize];
const int cols = block_size_wide[bsize];
const int num_planes = av1_num_planes(cm);
- av1_init_rd_stats(rd_stats);
- av1_init_rd_stats(rd_stats_y);
- av1_init_rd_stats(rd_stats_uv);
+ const int skip_ctx = av1_get_skip_context(xd);
+
+ int known_rate = intra_mode_cost[mbmi->mode];
+ known_rate += ref_frame_cost;
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+
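The early-out above costs only the bits that are already known (mode, ref frame, intra penalty, and the cheaper of the two skip flags) at zero distortion; since RDCOST grows with both rate and distortion, the result is a valid lower bound on the mode's final cost. A simplified sketch with a stand-in for the encoder's RDCOST macro:

    #include <stdint.h>

    // Simplified stand-in for RDCOST: lambda-weighted rate plus distortion.
    static int64_t rd_cost(int64_t rdmult, int rate, int64_t dist) {
      return rdmult * rate + dist;
    }

    // Prune when even a distortion-free outcome cannot beat the best rd.
    static int prune_by_known_rate(int64_t rdmult, int known_rate,
                                   int64_t best_rd) {
      return rd_cost(rdmult, known_rate, 0) > best_rd;
    }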
TX_SIZE uv_tx;
int is_directional_mode = av1_is_directional_mode(mbmi->mode);
if (is_directional_mode && av1_use_angle_delta(bsize)) {
@@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state,
search_state->directional_mode_skip_mask);
search_state->angle_stats_ready = 1;
}
- if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0;
+ if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
+ av1_init_rd_stats(rd_stats_y);
rd_stats_y->rate = INT_MAX;
- rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize,
- intra_mode_cost[mbmi->mode], search_state->best_rd,
- &model_rd);
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
+ bsize, intra_mode_cost[mbmi->mode],
+ search_state->best_rd, &model_rd);
} else {
+ av1_init_rd_stats(rd_stats_y);
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
}
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
memcpy(best_blk_skip, x->blk_skip,
sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-
+ int try_filter_intra = 0;
+ int64_t best_rd_tmp = INT64_MAX;
if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ if (rd_stats_y->rate != INT_MAX) {
+ const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+ intra_mode_cost[mbmi->mode];
+ best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
+ } else {
+ try_filter_intra = !(search_state->best_mbmode.skip);
+ }
+ }
+ if (try_filter_intra) {
RD_STATS rd_stats_y_fi;
int filter_intra_selected_flag = 0;
TX_SIZE best_tx_size = mbmi->tx_size;
@@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state,
memcpy(best_txk_type, mbmi->txk_type,
sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
- int64_t best_rd_tmp = INT64_MAX;
- if (rd_stats_y->rate != INT_MAX) {
- best_rd_tmp = RDCOST(x->rdmult,
- rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
- intra_mode_cost[mbmi->mode],
- rd_stats_y->dist);
- }
mbmi->filter_intra_mode_info.use_filter_intra = 1;
for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
int64_t this_rd_tmp;
mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-
super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
if (rd_stats_y_fi.rate == INT_MAX) {
continue;
@@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state,
intra_mode_cost[mbmi->mode]);
this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
+ break;
+ }
if (this_rd_tmp < best_rd_tmp) {
best_tx_size = mbmi->tx_size;
memcpy(best_txk_type, mbmi->txk_type,
@@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state,
mbmi->filter_intra_mode_info.use_filter_intra = 0;
}
}
-
- if (rd_stats_y->rate == INT_MAX) return 0;
-
+ if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+ const int mode_cost_y =
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_uv);
if (num_planes > 1) {
uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+ int rate_y =
+ rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+ const int64_t rdy =
+ RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
choose_intra_uv_mode(
cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
&search_state->rate_uv_tokenonly[uv_tx],
@@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state,
&search_state->mode_uv[uv_tx]);
if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
+ const int64_t uv_dist = search_state->dist_uvs[uv_tx];
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
}
rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
@@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state,
}
mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
}
-
- rd_stats->rate =
- rd_stats_y->rate +
- intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ rd_stats->rate = rd_stats_y->rate + mode_cost_y;
if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
// super_block_yrd above includes the cost of the tx_size in the
// tokenonly rate, but for intra blocks, tx_size is always coded
@@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state,
rd_stats_y->rate = 0;
rd_stats_uv->rate = 0;
// Cost the skip mb case
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
} else {
// Add in the cost of the no skip flag.
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0];
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
}
// Calculate the final RD estimate for this mode.
- int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
// Keep record of best intra rd
if (this_rd < search_state->best_intra_rd) {
search_state->best_intra_rd = this_rd;
@@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state,
search_state->best_pred_rd[i] =
AOMMIN(search_state->best_pred_rd[i], this_rd);
}
- return 1;
+ return this_rd;
}
-void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+static void collect_single_states(MACROBLOCK *x,
+ InterModeSearchState *search_state,
+ const MB_MODE_INFO *const mbmi) {
+ int i, j;
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+ const int mode_offset = INTER_OFFSET(this_mode);
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+ // Simple rd
+ int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < simple_rd) simple_rd = rd;
+ }
+
+ // Insertion sort of single_state
+ SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+ SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+ i = search_state->single_state_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+ state_s[j] = state_s[j - 1];
+ state_s[j] = this_state_s;
+ search_state->single_state_cnt[dir][mode_offset]++;
+
+ // Modelled rd
+ int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < modelled_rd) modelled_rd = rd;
+ }
+
+ // Insertion sort of single_state_modelled
+ SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode_offset];
+ i = search_state->single_state_modelled_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+ state_m[j] = state_m[j - 1];
+ state_m[j] = this_state_m;
+ search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+static void analyze_single_states(const AV1_COMP *cpi,
+ InterModeSearchState *search_state) {
+ int i, j, dir, mode;
+ if (cpi->sf.prune_comp_search_by_single_result >= 1) {
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+
+      // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+      // reference frames for all the modes (NEARESTMV and NEARMV may not
+      // have the same motion vectors). Always keep the best of each mode
+      // because it might form the best possible combination with another
+      // mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+ ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid)
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ if (count < max_candidates) {
+ for (i = 0; i < state_cnt_m; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (state_m[i].valid) {
+ int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+          // Check whether this ref_frame is already in the list
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) {
+            // Check whether this ref_frame was pruned in the simple rd pass
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; j++) {
+ if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) {
+ valid = 0;
+ break;
+ }
+ }
+ if (valid)
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ if (count >= max_candidates) break;
+ }
+ }
+ }
+ }
+ }
+}
+
+static int compound_skip_get_candidates(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const int dir, const PREDICTION_MODE mode) {
+ const int mode_offset = INTER_OFFSET(mode);
+ const SingleInterModeState *state =
+ search_state->single_state[dir][mode_offset];
+ const SingleInterModeState *state_modelled =
+ search_state->single_state_modelled[dir][mode_offset];
+ int max_candidates = 0;
+ int candidates;
+
+ for (int i = 0; i < FWD_REFS; ++i) {
+ if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+ max_candidates++;
+ }
+
+ candidates = max_candidates;
+ if (cpi->sf.prune_comp_search_by_single_result >= 2) {
+ candidates = AOMMIN(2, max_candidates);
+ }
+ if (cpi->sf.prune_comp_search_by_single_result >= 3) {
+ if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+ state[0].ref_frame == state_modelled[0].ref_frame)
+ candidates = 1;
+ if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+ }
+ return candidates;
+}
+
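compound_skip_get_candidates maps the prune_comp_search_by_single_result speed level to how many top-ranked single-ref candidates a compound mode must appear among. A toy restatement of the policy; parameter names are illustrative:

    static int candidate_count(int speed_level, int max_candidates,
                               int top_refs_agree, int is_near_or_global) {
      int c = max_candidates;
      if (speed_level >= 2 && c > 2) c = 2;
      if (speed_level >= 3 && (top_refs_agree || is_near_or_global)) c = 1;
      return c;
    }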
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (mode[i] == NEARESTMV || mode[i] == NEARMV) {
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+      int identical = 1;
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs,
+ x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext);
+
+        identical &= (single_mv.as_int == comp_mv.as_int);
+        if (!identical) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ if (ref_searched[i] && ref_mv_match[i]) {
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+ }
+
+ return 0;
+}
+
+static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
+ InterModeSearchState *search_state) {
+ const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1];
+ if (search_state->num_available_refs > 2) {
+ if ((ref_frame == search_state->dist_order_refs[0] &&
+ second_ref_frame == search_state->dist_order_refs[1]) ||
+ (ref_frame == search_state->dist_order_refs[1] &&
+ second_ref_frame == search_state->dist_order_refs[0]))
+ return 1; // drop this pair of refs
+ }
+ return 0;
+}
+
+static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
+ const MODE_DEFINITION *mode,
+ int64_t distortion2) {
+ const PREDICTION_MODE this_mode = mode->mode;
+ MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const int idx = ref_frame - LAST_FRAME;
+ if (idx && distortion2 > search_state->dist_refs[idx]) {
+ search_state->dist_refs[idx] = distortion2;
+ search_state->dist_order_refs[idx] = ref_frame;
+ }
+
+  // Reached the last single ref prediction mode.
+  if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
+    // Bubble-sort dist_refs and the corresponding order indices.
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ for (int k = i + 1; k < REF_FRAMES; ++k) {
+ if (search_state->dist_refs[i] < search_state->dist_refs[k]) {
+ int64_t tmp_dist = search_state->dist_refs[i];
+ search_state->dist_refs[i] = search_state->dist_refs[k];
+ search_state->dist_refs[k] = tmp_dist;
+
+ int tmp_idx = search_state->dist_order_refs[i];
+ search_state->dist_order_refs[i] = search_state->dist_order_refs[k];
+ search_state->dist_order_refs[k] = tmp_idx;
+ }
+ }
+ }
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (search_state->dist_refs[i] == -1) break;
+ search_state->num_available_refs = i;
+ }
+ search_state->num_available_refs++;
+ }
+}
+
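These two helpers work as a pair: sf_drop_ref_analyze records the worst distortion seen for each single reference and, once the last single-ref mode is reached, sorts the refs by distortion; sf_check_is_drop_ref then rejects the compound pair formed by the two worst refs. A toy version of the sort-and-check core, independent of the encoder types:

    #include <stdint.h>

    #define N_REFS 8

    // Sort distortions in descending order, carrying the ref indices along.
    static void sort_refs_by_dist(int64_t dist[N_REFS], int order[N_REFS]) {
      for (int i = 0; i < N_REFS; ++i)
        for (int k = i + 1; k < N_REFS; ++k)
          if (dist[i] < dist[k]) {
            int64_t td = dist[i]; dist[i] = dist[k]; dist[k] = td;
            int to = order[i]; order[i] = order[k]; order[k] = to;
          }
    }

    // A compound pair is dropped when it is exactly the two worst refs.
    static int is_dropped_pair(const int order[N_REFS], int ref0, int ref1) {
      return (ref0 == order[0] && ref1 == order[1]) ||
             (ref0 == order[1] && ref1 == order[0]);
    }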
+static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+ CompoundTypeRdBuffers *const bufs) {
+ CHECK_MEM_ERROR(
+ cm, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ CHECK_MEM_ERROR(
+ cm, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static void release_compound_type_rd_buffers(
+ CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs); // Set all pointers to NULL for safety.
+}
+
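These helpers bracket the mode loop below: the buffers are allocated once per call of av1_rd_pick_inter_mode_sb, threaded into handle_inter_mode through its new rd_buffers parameter, and released on exit. An abbreviated sketch of the lifecycle, not the actual control flow:

    // Sketch only: allocate once, reuse across the mode loop, release on
    // exit; release_compound_type_rd_buffers also NULLs the pointers.
    static void mode_search_sketch(AV1_COMMON *cm) {
      CompoundTypeRdBuffers rd_buffers;
      alloc_compound_type_rd_buffers(cm, &rd_buffers);
      // ... per-mode loop: handle_inter_mode(..., &rd_buffers, ...) ...
      release_compound_type_rd_buffers(&rd_buffers);
    }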
+void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, int mi_row, int mi_col,
RD_STATS *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
- const AV1_COMMON *const cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
const SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const struct segmentation *const seg = &cm->seg;
PREDICTION_MODE this_mode;
- MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
- int i, k;
+ int i;
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
unsigned int ref_costs_single[REF_FRAMES];
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
@@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
InterModeSearchState search_state;
init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
best_rd_so_far);
-
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
HandleInterModeArgs args = {
{ NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
{ NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
NULL, NULL,
- NULL, NULL,
+ NULL, search_state.modelled_rd,
{ { 0 } }, INT_MAX,
- INT_MAX
+ INT_MAX, search_state.simple_rd,
+ 0, interintra_modes
};
for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
av1_invalid_rd_stats(rd_cost);
// init params, set frame modes, speed features
- set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
- ref_frame_skip_mask, mode_skip_mask,
- ref_costs_single, ref_costs_comp, yv12_mb);
+ set_params_rd_pick_inter_mode(
+ cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
+ ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
int64_t best_est_rd = INT64_MAX;
+ // TODO(angiebird): Turn this on when this speed feature is well tested
+#if 1
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ const int do_tx_search = !md->ready;
+#else
+ const int do_tx_search = 1;
+#endif
+ InterModesInfo *inter_modes_info = &tile_data->inter_modes_info;
+ inter_modes_info->num = 0;
#endif
+ int intra_mode_num = 0;
+ int intra_mode_idx_ls[MAX_MODES];
+ int reach_first_comp_mode = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+  // We allocate them once and reuse them in every call to that function.
+  // Note: they must be allocated on the heap due to the large size of the
+  // arrays.
+ uint8_t *tmp_buf_orig;
+ CHECK_MEM_ERROR(
+ cm, tmp_buf_orig,
+ (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE));
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig);
+
+ CompoundTypeRdBuffers rd_buffers;
+ alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
for (int midx = 0; midx < MAX_MODES; ++midx) {
int mode_index = mode_map[midx];
int64_t this_rd = INT64_MAX;
@@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
int64_t distortion2 = 0;
int skippable = 0;
int this_skip2 = 0;
-
- this_mode = av1_mode_order[mode_index].mode;
- ref_frame = av1_mode_order[mode_index].ref_frame[0];
- second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
+ const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+ const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+ this_mode = mode_order->mode;
init_mbmi(mbmi, mode_index, cm);
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
- if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index,
- mi_row, mi_col, mode_skip_mask,
- ref_frame_skip_mask))
- continue;
-
- if (ref_frame == INTRA_FRAME) {
- if (sf->skip_intra_in_interframe && search_state.skip_intra_modes)
- continue;
+ // Reach the first compound prediction mode
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+ reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, &search_state);
+ reach_first_comp_mode = 1;
}
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
+ ref_frame_skip_mask, &search_state);
+ if (ret == 1) continue;
+ args.skip_motion_mode = (ret == 2);
- if (sf->drop_ref) {
- if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) {
- if (search_state.num_available_refs > 2) {
- if ((ref_frame == search_state.dist_order_refs[0] &&
- second_ref_frame == search_state.dist_order_refs[1]) ||
- (ref_frame == search_state.dist_order_refs[1] &&
- second_ref_frame == search_state.dist_order_refs[0]))
- continue;
- }
+ if (sf->drop_ref && comp_pred) {
+ if (sf_check_is_drop_ref(mode_order, &search_state)) {
+ continue;
}
}
if (search_state.best_rd < search_state.mode_threshold[mode_index])
continue;
- const int comp_pred = second_ref_frame > INTRA_FRAME;
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ continue;
+ }
+
const int ref_frame_cost = comp_pred
? ref_costs_comp[ref_frame][second_ref_frame]
: ref_costs_single[ref_frame];
@@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
}
if (ref_frame == INTRA_FRAME) {
- RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
- const int ret = handle_intra_mode(
- &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip,
- &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
- if (!ret) {
- continue;
- }
- rate2 = intra_rd_stats.rate;
- distortion2 = intra_rd_stats.dist;
- this_rd = RDCOST(x->rdmult, rate2, distortion2);
- skippable = intra_rd_stats.skip;
- rate_y = intra_rd_stats_y.rate;
+ intra_mode_idx_ls[intra_mode_num++] = mode_index;
+ continue;
} else {
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
mbmi->angle_delta[PLANE_TYPE_UV] = 0;
@@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
args.single_newmv = search_state.single_newmv;
args.single_newmv_rate = search_state.single_newmv_rate;
args.single_newmv_valid = search_state.single_newmv_valid;
- args.modelled_rd = search_state.modelled_rd;
args.single_comp_cost = real_compmode_cost;
args.ref_frame_cost = ref_frame_cost;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
- &rd_stats_uv, &disable_skip, mi_row, mi_col,
- &args, ref_best_rd, &best_est_rd);
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+ mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+ &best_est_rd, do_tx_search, inter_modes_info);
#else
this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
&rd_stats_uv, &disable_skip, mi_row, mi_col,
- &args, ref_best_rd);
+ &args, ref_best_rd, tmp_buf, &rd_buffers);
#endif
rate2 = rd_stats.rate;
skippable = rd_stats.skip;
@@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
rate_uv = rd_stats_uv.rate;
}
+ if (sf->prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
if (this_rd == INT64_MAX) continue;
this_skip2 = mbmi->skip;
@@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
search_state.best_mbmode = *mbmi;
search_state.best_skip2 = this_skip2;
search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (do_tx_search) {
+        // When do_tx_search == 0, handle_inter_mode won't provide correct
+        // rate_y and rate_uv because the txfm_search process is replaced
+        // by rd estimation.
+        // Therefore, we should avoid updating best_rate_y and best_rate_uv
+        // here. These two values will be updated when txfm_search is
+        // called.
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+ }
+#else // CONFIG_COLLECT_INTER_MODE_RD_STATS
search_state.best_rate_y =
rate_y +
x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
search_state.best_rate_uv = rate_uv;
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
}
@@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
+ if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+ // Collect data from single ref mode, and analyze data.
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+ }
- if (sf->drop_ref) {
- if (second_ref_frame == NONE_FRAME) {
- const int idx = ref_frame - LAST_FRAME;
- if (idx && distortion2 > search_state.dist_refs[idx]) {
- search_state.dist_refs[idx] = distortion2;
- search_state.dist_order_refs[idx] = ref_frame;
- }
+ if (x->skip && !comp_pred) break;
+ }
- // Reach the last single ref prediction mode
- if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
- // bubble sort dist_refs and the order index
- for (i = 0; i < REF_FRAMES; ++i) {
- for (k = i + 1; k < REF_FRAMES; ++k) {
- if (search_state.dist_refs[i] < search_state.dist_refs[k]) {
- int64_t tmp_dist = search_state.dist_refs[i];
- search_state.dist_refs[i] = search_state.dist_refs[k];
- search_state.dist_refs[k] = tmp_dist;
-
- int tmp_idx = search_state.dist_order_refs[i];
- search_state.dist_order_refs[i] =
- search_state.dist_order_refs[k];
- search_state.dist_order_refs[k] = tmp_idx;
- }
- }
- }
+ aom_free(tmp_buf_orig);
+ tmp_buf_orig = NULL;
+ release_compound_type_rd_buffers(&rd_buffers);
- for (i = 0; i < REF_FRAMES; ++i) {
- if (search_state.dist_refs[i] == -1) break;
- search_state.num_available_refs = i;
- }
- search_state.num_available_refs++;
- }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state.best_rd = INT64_MAX;
+
+ int64_t top_est_rd =
+ inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+ for (int j = 0; j < inter_modes_info->num; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.9 > top_est_rd) {
+ continue;
+ }
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_rate, search_state.best_rd)) {
+ continue;
+ } else {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+ if (rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = rd_stats.rdcost;
+ // Note index of best mode so far
+ const int mode_index = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ search_state.best_mode_index = mode_index;
+ *rd_cost = rd_stats;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = mbmi->skip;
+ search_state.best_mode_skippable = rd_stats.skip;
+ search_state.best_rate_y =
+ rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+ search_state.best_rate_uv = rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
}
}
+ }
+#endif
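With do_tx_search == 0, the loop above is the second half of a two-pass scheme: the mode loop stored each inter candidate with an estimated rd, the candidates were sorted by that estimate, and the full transform search now runs only on candidates whose estimate stays within roughly 11% of the best one (est * 0.9 > top skips, i.e. est > top / 0.9). A toy version of the gate:

    #include <stdint.h>

    // Keep a sorted candidate only while its estimated rd is within
    // ~11% of the best estimate, mirroring curr_est_rd * 0.9 > top_est_rd.
    static int passes_est_rd_gate(int64_t est_rd, int64_t top_est_rd) {
      return !(est_rd * 0.9 > top_est_rd);
    }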
- if (x->skip && !comp_pred) break;
+ for (int j = 0; j < intra_mode_num; ++j) {
+ const int mode_index = intra_mode_idx_ls[j];
+ const MV_REFERENCE_FRAME ref_frame =
+ av1_mode_order[mode_index].ref_frame[0];
+ assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+ assert(ref_frame == INTRA_FRAME);
+ if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+ init_mbmi(mbmi, mode_index, cm);
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ }
+
+ RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+ const int ref_frame_cost = ref_costs_single[ref_frame];
+ intra_rd_stats.rdcost = handle_intra_mode(
+ &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+ &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+ if (intra_rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = intra_rd_stats.rdcost;
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+ *rd_cost = intra_rd_stats;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = intra_rd_stats.skip;
+ search_state.best_rate_y =
+ intra_rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+ search_state.best_rate_uv = intra_rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
}
// In effect only when speed >= 2.
@@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
// Only try palette mode when the best mode so far is an intra mode.
if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
- search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi,
+ search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
ref_costs_single, &search_state);
}
@@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
// Select the samples according to motion vector difference
- if (mbmi->num_proj_ref[0] > 1)
- mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
- mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
}
set_default_interp_filters(mbmi, cm->interp_filter);
@@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above(
struct calc_target_weighted_pred_ctxt *ctxt =
(struct calc_target_weighted_pred_ctxt *)fun_ctxt;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
@@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left(
struct calc_target_weighted_pred_ctxt *ctxt =
(struct calc_target_weighted_pred_ctxt *)fun_ctxt;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
@@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
int above_stride, const uint8_t *left,
int left_stride) {
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
- const int bh = xd->n8_h << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
+ const int bh = xd->n4_h << MI_SIZE_LOG2;
int32_t *mask_buf = x->mask_buf;
int32_t *wsrc_buf = x->wsrc_buf;
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 12df472c1..4c11f90b8 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RDOPT_H_
-#define AV1_ENCODER_RDOPT_H_
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
#include "av1/common/blockd.h"
#include "av1/common/txb_common.h"
@@ -25,6 +25,10 @@ extern "C" {
#endif
#define MAX_REF_MV_SERCH 3
+#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1
+#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2
+#define DEFAULT_INTERP_SKIP_FLAG \
+ (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)
struct TileInfo;
struct macroblock;
@@ -111,7 +115,7 @@ unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs, int bd);
-void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x, int mi_row, int mi_col,
struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -123,14 +127,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#define INTER_MODE_RD_TEST 0
-void av1_inter_mode_data_init();
-void av1_inter_mode_data_fit(int rdmult);
-void av1_inter_mode_data_show(const AV1_COMMON *cm);
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_RDOPT_H_
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 000000000..23d920fc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+ int bw, int bh) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+ } else {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
+ }
+}
+
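calc_subpel_params keeps positions in a fixed-point space where the high bits select the integer pixel and the low bits carry the sub-pixel phase: the buffer offset comes from pos >> SCALE_SUBPEL_BITS and the filter phase from pos & SCALE_SUBPEL_MASK. A standalone sketch of that split; the 10-bit width is an illustrative assumption, not necessarily libaom's actual constant:

    #define DEMO_SUBPEL_BITS 10
    #define DEMO_SUBPEL_MASK ((1 << DEMO_SUBPEL_BITS) - 1)

    // Split a fixed-point position into pixel offset and sub-pixel phase.
    static void split_subpel_pos(int pos, int *pel, int *phase) {
      *pel = pos >> DEMO_SUBPEL_BITS;   // integer pixel offset in the buffer
      *phase = pos & DEMO_SUBPEL_MASK;  // phase selecting the interp filter
    }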
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+    // Dimensions of this sub-8x8 block (b4) and of the enclosing
+    // chroma-scaled 8x8 block (b8), in plane pixels.
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+          // Masked compound types have their own averaging mechanism.
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+ b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+ return;
+ }
+
+ {
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
+        // Masked compound types have their own averaging mechanism.
+ conv_params.do_average = 0;
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, plane, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+ cm->allow_warped_motion);
+ } else {
+ conv_params.do_average = ref;
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+ mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+ }
+}
+
+static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ int plane_from, int plane_to) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = pd->width;
+ const int bh = pd->height;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+ }
+}
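+
+// Note: with 4:2:0 subsampling, a sub-8x8 partition carries its chroma with
+// only one of its 4x4 luma positions (the bottom-right one), which is what
+// the is_chroma_reference() check above filters on; the remaining positions
+// simply skip their chroma planes here.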
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
+}
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
+ plane_idx);
+ }
+}
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx) {
+ build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
+ plane_idx);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
+ if (!ctx) {
+ default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf;
+ default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride;
+ ctx = &default_ctx;
+ }
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf,
+ xd->plane[plane_idx].dst.stride, ctx,
+ plane_idx, bsize);
+ }
+}
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ const int num_planes = av1_num_planes(cm);
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
+
+// TODO(sarahparker):
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+ mv.col += SCALE_EXTRA_OFF;
+ mv.row += SCALE_EXTRA_OFF;
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ mv.col & SCALE_SUBPEL_MASK,
+ mv.row & SCALE_SUBPEL_MASK };
+ src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+ (mv.col >> SCALE_SUBPEL_BITS);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+ w, h, conv_params, interp_filters, warp_types, p_col,
+ p_row, plane, ref, xd->mi[0], 0, xd,
+ can_use_previous);
+}
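+
+// Precision note for the doubling above: MV_PRECISION_Q3 motion vectors are
+// in 1/8-pel units and MV_PRECISION_Q4 in 1/16-pel units, so e.g. a Q3 mv of
+// (3, -5) becomes the Q4 mv (6, -10), i.e. the same displacement of
+// (0.375, -0.625) pel.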
+
+static INLINE void build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+ av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+ above_mbmi, ctxt, num_planes);
+ mi_x = above_mi_col << MI_SIZE_LOG2;
+ mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *above_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+
+ // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+ // prediction block. This is half the height of the original block,
+  // except for 128-tall blocks, where we only use a height of 32.
+ int this_height = xd->n4_h * MI_SIZE;
+ int pred_height = AOMMIN(this_height / 2, 32);
+ xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_right_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_prediction_by_above_pred, &ctxt);
+
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+ xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+}
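+
+// The mb_to_*_edge fields are kept in 1/8-pel units (hence the "* 8" above),
+// so for a 64-pel-tall block the adjustment temporarily widens
+// mb_to_bottom_edge by (64 - 32) * 8 == 256 for the overlap region, and the
+// subtraction at the end restores it.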
+
+static INLINE void build_prediction_by_left_pred(
+ MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+ av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+ left_mbmi, ctxt, num_planes);
+ mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+ mi_y = left_mi_row << MI_SIZE_LOG2;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *left_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+
+ // Adjust mb_to_right_edge to have the correct value for the OBMC
+ // prediction block. This is half the width of the original block,
+ // except for 128-wide blocks, where we only use a width of 32.
+ int this_width = xd->n4_w * MI_SIZE;
+ int pred_width = AOMMIN(this_width / 2, 32);
+ xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_bottom_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_prediction_by_left_pred, &ctxt);
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+ xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+ } else {
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+ }
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+ mi_row, mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+ dst_buf2, dst_stride2);
+}
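+
+// Buffer layout note: each tmp_obmc_bufs[i] holds three plane regions of
+// MAX_SB_SQUARE samples back to back (Y, then U, then V). In the
+// high-bitdepth case the offsets are scaled by sizeof(uint16_t) and the
+// pointers re-tagged with CONVERT_TO_BYTEPTR, matching the layout used above.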
+
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+ int bw, int bh, int x, int y,
+ int w, int h, int mi_x, int mi_y,
+ int ref, uint8_t *const ext_dst,
+ int ext_dst_stride,
+ int can_use_previous) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO *mi = xd->mi[0];
+
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
+ const MV mv = mi->mv[ref].as_mv;
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ const int pre_x = (mi_x) >> pd->subsampling_x;
+ const int pre_y = (mi_y) >> pd->subsampling_y;
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+ &subpel_params, sf, w, h, &conv_params,
+ mi->interp_filters, &warp_types, pre_x + x,
+ pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
+ mi_y, ref, ext_dst[plane],
+ ext_dst_stride[plane], can_use_previous);
+ }
+}
+
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
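+
+// Worked example of the subsampling derivation above, assuming MI_SIZE == 4:
+// for BLOCK_8X8, mi_size_high_log2[BLOCK_8X8] == 1, so (2 << 1) == 4 and
+// subh is 1 exactly when the caller passes the subsampled chroma height
+// h == 4, and 0 for the full luma height h == 8.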
+
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ // const uint8_t *mask =
+ // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+ mbmi->sb_type, h, w, xd->bd);
+ else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+ h, w);
+ } else {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+ xd->bd);
+ else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+ 0, NULL, 0, w, h);
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]) {
+ int plane;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+}
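+
+// Typical encoder usage (a sketch): the wedge search builds each single-ref
+// prediction once into ext_dst0/ext_dst1 via
+// av1_build_inter_predictors_for_planes_single_buf(), then calls
+// av1_build_wedge_inter_predictor_from_buf() per wedge candidate, so only
+// the cheap mask blend is redone for each candidate.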
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 000000000..10d5e8c28
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/filter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx);
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+// Detect whether the block has sub-pixel level motion vectors,
+// per component.
+#define CHECK_SUBPEL 0
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
+ const MACROBLOCKD *const xd,
+ int dir) {
+#if CHECK_SUBPEL
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int plane;
+ int ref = (dir >> 1);
+
+ if (dir & 0x01) {
+ if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+ } else {
+ if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+ }
+
+ return 0;
+#else
+ (void)mbmi;
+ (void)xd;
+ (void)dir;
+ return 1;
+#endif
+}
+
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int is_compound = has_second_ref(mi);
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ int row_col;
+ for (row_col = 0; row_col < 2; ++row_col) {
+ const int dir = (ref << 1) + row_col;
+ if (has_subpel_mv_component(mi, xd, dir)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
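+
+// The dir argument above packs (ref, component) into two bits: dir >> 1
+// selects the reference (0 or 1) and dir & 1 selects the column (1) versus
+// row (0) mv component, so a compound block probes dirs 0..3 and returns on
+// the first subpel component found.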
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous);
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
index a207b0f26..1ad13d66a 100644
--- a/third_party/aom/av1/encoder/segmentation.h
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_SEGMENTATION_H_
-#define AV1_ENCODER_SEGMENTATION_H_
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
#include "av1/common/blockd.h"
#include "av1/encoder/encoder.h"
@@ -35,4 +35,4 @@ void av1_reset_segment_features(AV1_COMMON *cm);
} // extern "C"
#endif
-#endif // AV1_ENCODER_SEGMENTATION_H_
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index d4b4b19c4..4c35baae0 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -98,6 +98,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
sf->use_square_partition_only_threshold = BLOCK_64X64;
}
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+
if (speed >= 1) {
if (is_720p_or_larger) {
sf->use_square_partition_only_threshold = BLOCK_128X128;
@@ -106,6 +115,14 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
} else {
sf->use_square_partition_only_threshold = BLOCK_32X32;
}
+
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
}
if (speed >= 2) {
@@ -126,13 +143,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
if (speed >= 3) {
if (is_720p_or_larger) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 25);
sf->partition_search_breakout_rate_thr = 200;
} else {
sf->max_intra_bsize = BLOCK_32X32;
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 23);
sf->partition_search_breakout_rate_thr = 120;
}
@@ -166,6 +181,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
// Speed 0 for all speed features that give neutral coding performance change.
sf->reduce_inter_modes = 1;
sf->prune_ext_partition_types_search_level = 1;
+ sf->ml_prune_rect_partition = 1;
sf->ml_prune_ab_partition = 1;
sf->ml_prune_4_partition = 1;
sf->adaptive_txb_search_level = 1;
@@ -173,6 +189,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->model_based_prune_tx_search_level = 1;
sf->model_based_post_interp_filter_breakout = 1;
sf->inter_mode_rd_model_estimation = 1;
+ sf->prune_ref_frame_for_rect_partitions =
+ !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+ sf->less_rectangular_check_level = 1;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+ sf->gm_disable_recode = 1;
if (speed >= 1) {
sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -182,8 +203,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->intra_tx_size_search_init_depth_rect = 1;
sf->intra_tx_size_search_init_depth_sqr = 1;
sf->tx_size_search_lgr_block = 1;
- sf->two_pass_partition_search = 1;
- sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
+ sf->two_pass_partition_search = 1;
+ sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ }
sf->prune_ext_partition_types_search_level = 2;
sf->use_fast_interpolation_filter_search = 1;
sf->skip_repeat_interpolation_filter_search = 1;
@@ -198,6 +221,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->use_intra_txb_hash = 1;
sf->optimize_b_precheck = 1;
sf->dual_sgr_penalty_level = 1;
+ sf->use_accurate_subpel_search = 1;
+ sf->reuse_inter_intra_mode = 1;
+ sf->prune_comp_search_by_single_result = 1;
+ sf->skip_repeated_newmv = 1;
+ sf->obmc_full_pixel_search_level = 1;
}
if (speed >= 2) {
@@ -206,7 +234,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->selective_ref_frame = 2;
sf->fast_cdef_search = 1;
- sf->use_rd_breakout = 1;
sf->adaptive_rd_thresh = 1;
sf->mv.auto_mv_step_size = 1;
sf->mv.subpel_iters_per_step = 1;
@@ -224,8 +251,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
if (speed >= 3) {
sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
- sf->less_rectangular_check = 1;
- sf->mode_skip_start = 10;
+ sf->less_rectangular_check_level = 2;
sf->adaptive_pred_interp_filter = 1;
// adaptive_motion_search breaks encoder multi-thread tests.
// The values in x->pred_mv[] differ for single and multi-thread cases.
@@ -237,6 +263,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->adaptive_rd_thresh = 2;
sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
sf->gm_search_type = GM_DISABLE_SEARCH;
+ sf->prune_comp_search_by_single_result = 2;
}
if (speed >= 4) {
@@ -250,10 +277,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
- sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
- sf->mode_skip_start = 6;
- sf->adaptive_interp_filter_search = 1;
}
if (speed >= 5) {
@@ -276,7 +300,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
- sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
sf->partition_search_breakout_rate_thr = 300;
sf->use_transform_domain_distortion = 2;
@@ -296,33 +319,17 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->simple_model_rd_from_var = 1;
}
if (speed >= 7) {
- const int is_keyframe = cm->frame_type == KEY_FRAME;
- const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
sf->default_max_partition_size = BLOCK_32X32;
sf->default_min_partition_size = BLOCK_8X8;
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->frame_parameter_update = 0;
sf->mv.search_method = FAST_HEX;
- sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
- sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
sf->partition_search_type = REFERENCE_PARTITION;
- sf->reuse_inter_pred_sby = 1;
- sf->force_frame_boost =
- is_keyframe ||
- (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
- sf->max_delta_qindex = is_keyframe ? 20 : 15;
- sf->coeff_prob_appx_step = 4;
sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
}
if (speed >= 8) {
sf->mv.search_method = FAST_DIAMOND;
- sf->mv.fullpel_search_step_param = 10;
sf->mv.subpel_force_stop = 2;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
}
@@ -356,54 +363,6 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
}
-static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
- AV1_COMMON *const cm = &cpi->common;
-
- if (speed & TXFM_CODING_SF) {
- sf->inter_tx_size_search_init_depth_rect = 1;
- sf->inter_tx_size_search_init_depth_sqr = 1;
- sf->intra_tx_size_search_init_depth_rect = 1;
- sf->intra_tx_size_search_init_depth_sqr = 1;
- sf->tx_size_search_method = USE_FAST_RD;
- sf->tx_type_search.fast_intra_tx_type_search = 1;
- sf->tx_type_search.fast_inter_tx_type_search = 1;
- }
-
- if (speed & INTER_PRED_SF) {
- sf->selective_ref_frame = 2;
- // sf->adaptive_motion_search = 1;
- sf->mv.auto_mv_step_size = 1;
- sf->adaptive_rd_thresh = 1;
- sf->mv.subpel_iters_per_step = 1;
- sf->adaptive_pred_interp_filter = 1;
- }
-
- if (speed & INTRA_PRED_SF) {
- sf->max_intra_bsize = BLOCK_32X32;
- }
-
- if (speed & PARTITION_SF) {
- if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
- has_internal_image_edge(cpi)) {
- sf->use_square_partition_only_threshold =
- frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4;
- } else {
- sf->use_square_partition_only_threshold =
- frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4;
- }
- sf->less_rectangular_check = 1;
- sf->prune_ext_partition_types_search_level = 2;
- }
-
- if (speed & LOOP_FILTER_SF) {
- sf->fast_cdef_search = 1;
- }
-
- if (speed & RD_SKIP_SF) {
- sf->use_rd_breakout = 1;
- }
-}
-
void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
SPEED_FEATURES *const sf = &cpi->sf;
@@ -432,9 +391,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
#endif // DISABLE_TRELLISQ_SEARCH
sf->gm_erroradv_type = GM_ERRORADV_TR_0;
sf->mv.reduce_first_step_size = 0;
- sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
- sf->mv.fullpel_search_step_param = 6;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
sf->tx_size_search_method = USE_FULL_RD;
@@ -450,7 +407,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 0;
- sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->alt_ref_search_fp = 0;
sf->partition_search_type = SEARCH_PARTITION;
@@ -461,22 +417,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->tx_type_search.fast_inter_tx_type_search = 0;
sf->tx_type_search.skip_tx_search = 0;
sf->selective_ref_frame = 0;
- sf->less_rectangular_check = 0;
+ sf->less_rectangular_check_level = 0;
sf->use_square_partition_only_threshold = BLOCK_128X128;
+ sf->prune_ref_frame_for_rect_partitions = 0;
sf->auto_min_max_partition_size = NOT_IN_USE;
sf->rd_auto_partition_min_limit = BLOCK_4X4;
sf->default_max_partition_size = BLOCK_LARGEST;
sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
- sf->last_partitioning_redo_frequency = 4;
sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
- sf->force_frame_boost = 0;
- sf->max_delta_qindex = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->adaptive_interp_filter_search = 0;
sf->allow_partition_search_skip = 0;
- sf->use_accurate_subpel_search = 1;
+ sf->use_accurate_subpel_search = 2;
sf->disable_wedge_search_var_thresh = 0;
sf->fast_wedge_sign_estimate = 0;
sf->drop_ref = 0;
@@ -491,48 +444,46 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->optimize_b_precheck = 0;
sf->jnt_comp_fast_tx_search = 0;
sf->jnt_comp_skip_mv_search = 0;
+ sf->reuse_inter_intra_mode = 0;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
}
- sf->use_rd_breakout = 0;
sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
- sf->use_fast_coef_updates = TWO_LOOP;
sf->use_fast_coef_costing = 0;
- sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
- sf->schedule_mode_search = 0;
- for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_LARGEST;
- sf->reuse_inter_pred_sby = 0;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
sf->always_this_block_size = BLOCK_16X16;
- sf->search_type_check_frequency = 50;
// Recode loop tolerance %.
sf->recode_tolerance = 25;
- sf->default_interp_filter = SWITCHABLE;
sf->partition_search_breakout_dist_thr = 0;
sf->partition_search_breakout_rate_thr = 0;
sf->simple_model_rd_from_var = 0;
sf->prune_ext_partition_types_search_level = 0;
+ sf->ml_prune_rect_partition = 0;
sf->ml_prune_ab_partition = 0;
sf->ml_prune_4_partition = 0;
sf->fast_cdef_search = 0;
+ for (i = 0; i < PARTITION_BLOCK_SIZES; ++i)
+ sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled.
// Set this at the appropriate speed levels
sf->use_transform_domain_distortion = 0;
sf->gm_search_type = GM_FULL_SEARCH;
+ sf->gm_disable_recode = 0;
sf->use_fast_interpolation_filter_search = 0;
sf->skip_repeat_interpolation_filter_search = 0;
sf->use_hash_based_trellis = 0;
+ sf->prune_comp_search_by_single_result = 0;
+ sf->skip_repeated_newmv = 0;
// Set decoder side speed feature to use less dual sgr modes
sf->dual_sgr_penalty_level = 0;
sf->inter_mode_rd_model_estimation = 0;
-
- set_dev_sf(cpi, sf, oxcf->dev_sf);
+ sf->obmc_full_pixel_search_level = 0;
if (oxcf->mode == GOOD)
set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
@@ -599,10 +550,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
x->min_partition_size = sf->default_min_partition_size;
x->max_partition_size = sf->default_max_partition_size;
- if (!cpi->oxcf.frame_periodic_boost) {
- sf->max_delta_qindex = 0;
- }
-
// This is only used in motion vector unit test.
if (cpi->oxcf.motion_vector_unit_test == 1)
cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
@@ -611,5 +558,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
#if CONFIG_DIST_8X8
if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+ if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
#endif // CONFIG_DIST_8X8
}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index d0408ba2f..41013b2e7 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_SPEED_FEATURES_H_
-#define AV1_ENCODER_SPEED_FEATURES_H_
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
#include "av1/common/enums.h"
@@ -54,25 +54,6 @@ enum {
(1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
(1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
(1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
- INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
- INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
- (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
- INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) |
- (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
- (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
- INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
- (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) |
- (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) |
- (1 << NEAR_NEWMV),
- INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
- (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
- (1 << NEAR_NEARMV),
INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
(1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
(1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
@@ -133,11 +114,6 @@ typedef enum {
} SUBPEL_SEARCH_METHODS;
typedef enum {
- NO_MOTION_THRESHOLD = 0,
- LOW_MOTION_THRESHOLD = 7
-} MOTION_THRESHOLD;
-
-typedef enum {
USE_FULL_RD = 0,
USE_FAST_RD,
USE_LARGESTALL,
@@ -179,12 +155,6 @@ typedef enum {
} MODE_SEARCH_SKIP_LOGIC;
typedef enum {
- FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
- FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
- FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
-} INTERP_FILTER_MASK;
-
-typedef enum {
NO_PRUNE = 0,
// eliminates one tx type in vertical and horizontal direction
PRUNE_ONE = 1,
@@ -224,16 +194,6 @@ typedef enum {
REFERENCE_PARTITION
} PARTITION_SEARCH_TYPE;
-typedef enum {
- // Does a dry run to see if any of the contexts need to be updated or not,
- // before the final run.
- TWO_LOOP = 0,
-
- // No dry run, also only half the coef contexts and bands are updated.
- // The rest are not updated at all.
- ONE_LOOP_REDUCED = 1
-} FAST_COEFF_UPDATE;
-
typedef struct MV_SPEED_FEATURES {
// Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
SEARCH_METHODS search_method;
@@ -257,9 +217,6 @@ typedef struct MV_SPEED_FEATURES {
// Control when to stop subpel search
int subpel_force_stop;
-
- // This variable sets the step_param used in full pel motion search.
- int fullpel_search_step_param;
} MV_SPEED_FEATURES;
#define MAX_MESH_STEP 4
@@ -332,13 +289,6 @@ typedef struct SPEED_FEATURES {
// mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
- // Coefficient probability model approximation step size
- int coeff_prob_appx_step;
-
- // The threshold is to determine how slow the motino is, it is used when
- // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
- MOTION_THRESHOLD lf_motion_threshold;
-
// Determine which method we use to determine transform size. We can choose
// between options like full rd, largest for prediction size, largest
// for intra and model coefs for the rest.
@@ -355,11 +305,6 @@ typedef struct SPEED_FEATURES {
// largest transform only, since the largest transform block size is 64x64.
int tx_size_search_lgr_block;
- // After looking at the first set of modes (set by index here), skip
- // checking modes for reference frames that don't match the reference frame
- // of the best so far.
- int mode_skip_start;
-
PARTITION_SEARCH_TYPE partition_search_type;
TX_TYPE_SEARCH tx_type_search;
@@ -397,6 +342,9 @@ typedef struct SPEED_FEATURES {
// aggressiveness of pruning in order.
int prune_ext_partition_types_search_level;
+ // Use a ML model to prune horz and vert partitions
+ int ml_prune_rect_partition;
+
// Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
int ml_prune_ab_partition;
@@ -413,12 +361,16 @@ typedef struct SPEED_FEATURES {
int mode_pruning_based_on_two_pass_partition_search;
// Skip rectangular partition test when partition type none gives better
- // rd than partition type split.
- int less_rectangular_check;
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
// Use square partition only beyond this block size.
BLOCK_SIZE use_square_partition_only_threshold;
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
// Sets min and max partition sizes for this superblock based on the
// same superblock in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
@@ -435,10 +387,6 @@ typedef struct SPEED_FEATURES {
// frame's partitioning. Only used if use_lastframe_partitioning is set.
int adjust_partitioning_from_last_frame;
- // How frequently we re do the partitioning from scratch. Only used if
- // use_lastframe_partitioning is set.
- int last_partitioning_redo_frequency;
-
// Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
// it always, to allow it for only Last frame and Intra, disable it for all
// inter modes or to enable it always.
@@ -461,8 +409,6 @@ typedef struct SPEED_FEATURES {
// Pattern to be used for any exhaustive mesh searches.
MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
- int schedule_mode_search;
-
// Allows sub 8x8 modes to use the prediction filter that was determined
// best for 8x8 mode. If set to 0 we always re check all the filters for
// sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
@@ -472,20 +418,10 @@ typedef struct SPEED_FEATURES {
// Adaptive prediction mode search
int adaptive_mode_search;
- // Chessboard pattern prediction filter type search
- int cb_pred_filter_search;
-
int cb_partition_search;
int alt_ref_search_fp;
- // Use finer quantizer in every other few frames that run variable block
- // partition type search.
- int force_frame_boost;
-
- // Maximally allowed base quantization index fluctuation.
- int max_delta_qindex;
-
// Implements various heuristics to skip searching modes
// The heuristics selected are based on flags
// defined in the MODE_SEARCH_SKIP_HEURISTICS enum
@@ -506,22 +442,9 @@ typedef struct SPEED_FEATURES {
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
- // This variable enables an early break out of mode testing if the model for
- // rd built from the prediction signal indicates a value that's much
- // higher than the best rd we've seen so far.
- int use_rd_breakout;
-
// This feature controls how the loop filter level is determined.
LPF_PICK_METHOD lpf_pick;
- // This feature limits the number of coefficients updates we actually do
- // by only looking at counts from 1/2 the bands.
- FAST_COEFF_UPDATE use_fast_coef_updates;
-
- // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV
- // modes are used in order from LSB to MSB for each BLOCK_SIZE.
- int inter_mode_mask[BLOCK_SIZES_ALL];
-
// This feature controls whether we do the expensive context update and
// calculation in the rd coefficient costing loop.
int use_fast_coef_costing;
@@ -535,28 +458,13 @@ typedef struct SPEED_FEATURES {
// TODO(aconverse): Fold this into one of the other many mode skips
BLOCK_SIZE max_intra_bsize;
- // The frequency that we check if
- // FIXED_PARTITION search type should be used.
- int search_type_check_frequency;
-
- // When partition is pre-set, the inter prediction result from pick_inter_mode
- // can be reused in final block encoding process. It is enabled only for real-
- // time mode speed 6.
- int reuse_inter_pred_sby;
-
- // default interp filter choice
- InterpFilter default_interp_filter;
-
- // adaptive interp_filter search to allow skip of certain filter types.
- int adaptive_interp_filter_search;
-
- // mask for skip evaluation of certain interp_filter type.
- INTERP_FILTER_MASK interp_filter_search_mask;
-
// Partition search early breakout thresholds.
int64_t partition_search_breakout_dist_thr;
int partition_search_breakout_rate_thr;
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
// Allow skipping partition search for still image frame
int allow_partition_search_skip;
@@ -577,6 +485,9 @@ typedef struct SPEED_FEATURES {
GM_SEARCH_TYPE gm_search_type;
+  // Whether to disable the global motion recode loop.
+ int gm_disable_recode;
+
// Do limited interpolation filter search for dual filters, since best choice
// usually includes EIGHTTAP_REGULAR.
int use_fast_interpolation_filter_search;
@@ -624,6 +535,25 @@ typedef struct SPEED_FEATURES {
// Dynamically estimate final rd from prediction error and mode cost
int inter_mode_rd_model_estimation;
+
+ // Skip some ref frames in compound motion search by single motion search
+ // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
+ // increasing aggressiveness of skipping in order.
+  // Note: The search order might affect the result. It is better to search
+  // the same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+  // Flag to skip a NEWMV mode in the DRL if its motion search result is the
+  // same as that of a previously searched candidate.
+ int skip_repeated_newmv;
} SPEED_FEATURES;
struct AV1_COMP;
@@ -635,4 +565,4 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_SPEED_FEATURES_H_
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index d7e4f4eb3..75fdf02a5 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -25,6 +25,7 @@
#include "av1/encoder/mcomp.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/temporal_filter.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -37,13 +38,12 @@ static void temporal_filter_predictors_mb_c(
MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
uint8_t *pred, struct scale_factors *scale, int x, int y,
- int can_use_previous) {
- const int which_mv = 0;
+ int can_use_previous, int num_planes) {
const MV mv = { mv_row, mv_col };
enum mv_precision mv_precision_uv;
int uv_stride;
// TODO(angiebird): change plane setting accordingly
- ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
const InterpFilters interp_filters = xd->mi[0]->interp_filters;
WarpTypesAllowed warp_types;
memset(&warp_types, 0, sizeof(WarpTypesAllowed));
@@ -55,37 +55,21 @@ static void temporal_filter_predictors_mb_c(
uv_stride = stride;
mv_precision_uv = MV_PRECISION_Q3;
}
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
- 16, 16, which_mv, interp_filters,
- &warp_types, x, y, 0, MV_PRECISION_Q3, x,
- y, xd, can_use_previous);
-
- av1_highbd_build_inter_predictor(
- u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
- x, y, 1, mv_precision_uv, x, y, xd, can_use_previous);
-
- av1_highbd_build_inter_predictor(
- v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
- x, y, 2, mv_precision_uv, x, y, xd, can_use_previous);
- return;
- }
av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
&conv_params, interp_filters, &warp_types, x, y, 0,
0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
- av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
- &mv, scale, uv_block_width, uv_block_height,
- &conv_params, interp_filters, &warp_types, x, y, 1,
- 0, mv_precision_uv, x, y, xd, can_use_previous);
+ if (num_planes > 1) {
+ av1_build_inter_predictor(
+ u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
- av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
- &mv, scale, uv_block_width, uv_block_height,
- &conv_params, interp_filters, &warp_types, x, y, 2,
- 0, mv_precision_uv, x, y, xd, can_use_previous);
+ av1_build_inter_predictor(
+ v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ }
}
void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
@@ -214,7 +198,8 @@ void av1_highbd_temporal_filter_apply_c(
static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
- int stride) {
+ int stride, int x_pos,
+ int y_pos) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -250,11 +235,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
x->mvcost = x->mv_cost_stack;
x->nmvjointcost = x->nmv_vec_cost;
- // Use mv costing from x->mvcost directly
- av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
- cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
- &best_ref_mv1);
-
+ av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+ NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
x->mv_limits = tmp_mv_limits;
// Ignore mv costing by sending NULL pointer instead of cost array
@@ -370,7 +353,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(
cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
- frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+ mb_col * 16, mb_row * 16);
// Assign higher weight to matching MB if it's error
// score is lower. If not applying MC default behavior
@@ -386,7 +370,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
- mb_row * 16, cm->allow_warped_motion);
+ mb_row * 16, cm->allow_warped_motion, num_planes);
// Apply the filter (YUV)
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -556,14 +540,6 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
strength = group_boost / 300;
}
- // Adjustments for second level arf in multi arf case.
- if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
- strength >>= 1;
- }
- }
-
*arnr_frames = frames;
*arnr_strength = strength;
}
@@ -593,21 +569,6 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
int which_arf = gf_group->arf_update_idx[gf_group->index];
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- // Identify the index to the current ARF.
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- int arf_idx;
- for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) {
- which_arf = arf_idx;
- break;
- }
- }
- assert(arf_idx < num_arfs_in_gf);
- }
-#endif // USE_GF16_MULTI_LAYER
-
// Set the temporal filtering status for the corresponding OVERLAY frame
if (strength == 0 && frames_to_blur == 1)
cpi->is_arf_filter_off[which_arf] = 1;
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
index bc0863a63..2ddc68b2c 100644
--- a/third_party/aom/av1/encoder/temporal_filter.h
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_
-#define AV1_ENCODER_TEMPORAL_FILTER_H_
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
#ifdef __cplusplus
extern "C" {
@@ -22,4 +22,4 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance);
} // extern "C"
#endif
-#endif // AV1_ENCODER_TEMPORAL_FILTER_H_
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
index de1cbe99c..63b505f36 100644
--- a/third_party/aom/av1/encoder/tokenize.h
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TOKENIZE_H_
-#define AV1_ENCODER_TOKENIZE_H_
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
#include "av1/common/entropy.h"
#include "av1/encoder/block.h"
@@ -70,4 +70,4 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
} // extern "C"
#endif
-#endif // AV1_ENCODER_TOKENIZE_H_
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
index 69063b801..405bc9e6e 100644
--- a/third_party/aom/av1/encoder/tx_prune_model_weights.h
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
#ifdef __cplusplus
extern "C" {
@@ -19,79 +19,114 @@ extern "C" {
#include "av1/encoder/ml.h"
// Tx type model for 4x4 block.
-static const float av1_tx_type_nn_weights_4x4_layer0[32] = {
- 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f,
- -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f,
- 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f,
- -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f,
- 1.33417f, 1.72899f, 1.17983f, 0.10552f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
};
-static const float av1_tx_type_nn_bias_4x4_layer0[8] = {
- 1.96273f, -0.69845f, -0.10999f, -1.11311f,
- 1.35101f, 0.43842f, -0.29264f, -1.15376f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
};
-static const float av1_tx_type_nn_weights_4x4_layer1[32] = {
- 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f,
- -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f,
- 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f,
- -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f,
- 0.06517f, 0.72885f, 0.24387f, 1.28535f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
};
-static const float av1_tx_type_nn_bias_4x4_layer1[4] = {
- 1.23769f,
- 1.40308f,
- 0.09871f,
- 1.82070f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
};
-static const NN_CONFIG av1_tx_type_nnconfig_4x4 = {
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
4, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
8,
}, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0,
+ av1_tx_type_nn_weights_4x4_hor_layer1 },
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- av1_tx_type_nn_weights_4x4_layer0,
- av1_tx_type_nn_weights_4x4_layer1,
- },
- {
- av1_tx_type_nn_bias_4x4_layer0,
- av1_tx_type_nn_bias_4x4_layer1,
- },
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0,
+ av1_tx_type_nn_weights_4x4_ver_layer1 },
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
};
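
Each av1_tx_type_nnconfig_* above bundles a tiny two-layer MLP: the layer-0 weights map num_inputs features to the hidden nodes (e.g. 4 inputs x 8 hidden nodes = 32 layer-0 floats for the 4x4 models), and layer-1 maps the hidden nodes to 4 output scores, one per candidate transform type. As a minimal sketch of how such a config is evaluated — assuming a ReLU hidden layer and a linear output layer, as libaom's av1_nn_predict() in av1/encoder/ml.c applies to these tables; the TOY_NN_CONFIG struct and toy_nn_predict() below are simplified, hypothetical stand-ins, not the real NN_CONFIG — the forward pass looks like:

/* Hypothetical, simplified stand-in for NN_CONFIG (av1/encoder/ml.h);
 * the real struct supports multiple hidden layers. */
typedef struct {
  int num_inputs;
  int num_outputs;
  int num_hidden_nodes;
  const float *weights[2]; /* [0]: input->hidden, [1]: hidden->output */
  const float *bias[2];    /* [0]: hidden biases, [1]: output biases */
} TOY_NN_CONFIG;

/* Forward pass: one ReLU hidden layer, linear outputs. Weights are read
 * row-major, one row per destination node, which is consistent with the
 * flat array sizes above (e.g. 8 hidden x 4 inputs = 32 floats). */
static void toy_nn_predict(const float *features, const TOY_NN_CONFIG *cfg,
                           float *output) {
  float hidden[16]; /* >= num_hidden_nodes for every model in this file */
  for (int i = 0; i < cfg->num_hidden_nodes; ++i) {
    float val = cfg->bias[0][i];
    for (int j = 0; j < cfg->num_inputs; ++j)
      val += cfg->weights[0][i * cfg->num_inputs + j] * features[j];
    hidden[i] = val > 0.0f ? val : 0.0f; /* ReLU */
  }
  for (int i = 0; i < cfg->num_outputs; ++i) {
    float val = cfg->bias[1][i];
    for (int j = 0; j < cfg->num_hidden_nodes; ++j)
      val += cfg->weights[1][i * cfg->num_hidden_nodes + j] * hidden[j];
    output[i] = val; /* raw per-tx-type scores for the caller to threshold */
  }
}

Note that this revision splits each block-size model into _hor and _ver variants, so the pruning code evaluates a separate network for the horizontal and vertical 1-D transform decisions instead of the single per-block model used before.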
/******************************************************************************/
// Tx type model for 4x8 block.
static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
- 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f,
- 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f,
- 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f,
- -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f,
- 0.07575f, 0.34687f, 0.20518f, -0.87850f,
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
};
static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
- 1.14049f, -0.18583f, 1.92114f, -0.72057f,
- 1.32715f, 0.96713f, 1.09877f, -0.64345f,
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
};
static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
- 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f,
- -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f,
- 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f,
- 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f,
- 1.01163f, 2.06308f, 1.23331f, -0.15481f,
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
};
static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
- 2.14443f,
- 1.98356f,
- 0.74616f,
- 2.58795f,
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
};
static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
@@ -101,62 +136,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
{
8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_4x8_hor_layer0,
- av1_tx_type_nn_weights_4x8_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_4x8_hor_layer0,
- av1_tx_type_nn_bias_4x8_hor_layer1,
- },
+ { av1_tx_type_nn_weights_4x8_hor_layer0,
+ av1_tx_type_nn_weights_4x8_hor_layer1 },
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
};
static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
- 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f,
- 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f,
- -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f,
- -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f,
- 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f,
- 0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f,
- 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f,
- 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f,
- 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f,
- 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f,
- 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f,
- 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f,
- 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f,
- -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f,
- 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f,
- -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f,
- -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f,
- 0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f,
- 0.73600f, -0.11233f,
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
};
static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
- -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f,
- -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f,
- -0.63015f, -0.20008f, 1.24048f, -0.21265f,
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
};
static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
- -0.79795f, 0.45973f, -0.54188f, -1.05095f, 0.64404f, -0.56470f, -0.57018f,
- 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f,
- 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f,
- 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f,
- 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, -0.97669f, -1.06351f,
- 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f,
- 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f,
- -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f,
- -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f,
- 0.50359f,
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
};
static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
- 0.39274f,
- 1.27276f,
- 0.30322f,
- 2.55238f,
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
};
static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
@@ -166,64 +196,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_4x8_ver_layer0,
- av1_tx_type_nn_weights_4x8_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_4x8_ver_layer0,
- av1_tx_type_nn_bias_4x8_ver_layer1,
- },
+ { av1_tx_type_nn_weights_4x8_ver_layer0,
+ av1_tx_type_nn_weights_4x8_ver_layer1 },
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x4 block.
static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
- 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f,
- -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f,
- -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f,
- -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f,
- 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f,
- 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f,
- -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f,
- 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f,
- -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f,
- 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f,
- 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f,
- 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f,
- 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f,
- -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f,
- 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f,
- -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f,
- 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f,
- 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f,
- 0.25223f, -0.06761f,
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
};
static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
- -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f,
- -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f,
- -0.65392f, -0.25881f, 1.67995f, -0.03550f,
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
};
static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
- -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f,
- 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f,
- 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f,
- 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f,
- 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f,
- 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f,
- 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f,
- -0.42115f, -0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f,
- -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f,
- 0.44232f,
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
};
static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
- 0.14889f,
- 1.74197f,
- 0.53696f,
- 2.87574f,
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
@@ -233,42 +258,37 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x4_hor_layer0,
- av1_tx_type_nn_weights_8x4_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_8x4_hor_layer0,
- av1_tx_type_nn_bias_8x4_hor_layer1,
- },
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
};
static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
- 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f,
- 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f,
- 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f,
- -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f,
- 0.08220f, 0.34338f, 0.20245f, -0.88920f,
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
};
static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
- 1.14995f, -0.16021f, 2.38325f, -0.65179f,
- 1.09624f, 1.07662f, 0.63837f, -0.64847f,
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
};
static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
- 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f,
- -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f,
- 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f,
- 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f,
- 1.46345f, 1.84714f, 1.06804f, -0.13610f,
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
};
static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
- 2.41028f,
- 1.95675f,
- 0.82387f,
- 2.41923f,
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
@@ -278,131 +298,181 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
{
8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x4_ver_layer0,
- av1_tx_type_nn_weights_8x4_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_8x4_ver_layer0,
- av1_tx_type_nn_bias_8x4_ver_layer1,
- },
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x8 block.
-static const float av1_tx_type_nn_weights_8x8_layer0[128] = {
- 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f,
- 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f,
- -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f,
- -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f,
- 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f,
- 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f,
- 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f,
- 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f,
- -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f,
- 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f,
- 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f,
- -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f,
- 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f,
- 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f,
- 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f,
- -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f,
- -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f,
- 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f,
- 1.18941f, 0.10346f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer0[16] = {
- 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f,
- -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f,
- 0.64806f, -0.52520f, 1.38838f, 0.15519f,
-};
-
-static const float av1_tx_type_nn_weights_8x8_layer1[64] = {
- -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f,
- -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f,
- 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f,
- 0.12256f, -1.39594f, -1.02829f, 0.06134f, 2.25646f, -1.25196f, -2.65317f,
- -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f,
- -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f,
- -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f,
- -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f,
- -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f,
- -1.11974f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer1[4] = {
- 0.53574f,
- 1.57736f,
- -0.13698f,
- 2.64613f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_8x8 = {
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
8, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
16,
}, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- av1_tx_type_nn_weights_8x8_layer0,
- av1_tx_type_nn_weights_8x8_layer1,
- },
- {
- av1_tx_type_nn_bias_8x8_layer0,
- av1_tx_type_nn_bias_8x8_layer1,
- },
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x16 block.
static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
- 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f,
- 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f,
- -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f,
- 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f,
- 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f,
- 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f,
- 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f,
- -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f,
- -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f,
- 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f,
- 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f,
- 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f,
- 0.48759f, 0.55672f, 0.62597f, -0.38846f, 1.72124f, 0.08214f, -0.06650f,
- 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f,
- 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f,
- -0.11344f, -0.79108f, 0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f,
- 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f,
- 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f,
- 1.07698f, 0.02531f,
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
};
static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
- -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f,
- -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f,
- 1.46564f, -0.42959f, 1.55720f, 0.18479f,
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
};
static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
- -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f,
- 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f,
- 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f,
- 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f,
- 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f,
- 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f,
- -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f,
- 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f,
- -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f,
- 1.16772f,
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
};
static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
- 1.25783f,
- 1.19452f,
- 0.69964f,
- 2.41982f,
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
@@ -412,62 +482,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x16_hor_layer0,
- av1_tx_type_nn_weights_8x16_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_8x16_hor_layer0,
- av1_tx_type_nn_bias_8x16_hor_layer1,
- },
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
};
static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
- 0.90888f, 0.86305f, 0.81674f, 0.75352f, 1.07834f, 0.99048f, 0.96355f,
- 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f,
- 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f,
- 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f,
- 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f,
- 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f,
- -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f,
- 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 0.54576f,
- -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f,
- 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f,
- 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f,
- 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f,
- 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f,
- -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f,
- 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f,
- -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f,
- 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f,
- 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f,
- 0.25269f, 0.01562f,
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
};
static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
- -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f,
- -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f,
- -0.25956f, -0.29608f, 1.24359f, -0.09167f,
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
};
static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
- -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f,
- 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f,
- 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f,
- 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f,
- -0.42349f, 0.15375f, 0.52132f, -0.52421f, 0.89586f, -0.73778f, -0.10911f,
- 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f,
- 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 1.50938f,
- 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f,
- -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f,
- 0.44198f,
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
};
static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
- 0.68329f,
- 1.33555f,
- 0.25943f,
- 3.23439f,
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
@@ -477,64 +542,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x16_ver_layer0,
- av1_tx_type_nn_weights_8x16_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_8x16_ver_layer0,
- av1_tx_type_nn_bias_8x16_ver_layer1,
- },
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 16x8 block.
static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
- 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f,
- 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
- -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f,
- 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f,
- 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f,
- 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f,
- 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f,
- 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f,
- -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f,
- 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f,
- 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f,
- -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f,
- 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f,
- 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f,
- 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f,
- -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f,
- 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f,
- 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f,
- 0.43289f, -0.00362f,
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
};
static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
- -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f,
- -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f,
- -0.14741f, -0.43954f, 1.72137f, -0.21704f,
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
};
static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
- -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f,
- 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f,
- 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f,
- 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f,
- -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f,
- 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f,
- -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f,
- 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f,
- -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f,
- 0.63304f,
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
};
static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
- 0.71765f,
- 1.40400f,
- 0.32221f,
- 3.07234f,
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
};
static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
@@ -544,62 +604,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x8_hor_layer0,
- av1_tx_type_nn_weights_16x8_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_16x8_hor_layer0,
- av1_tx_type_nn_bias_16x8_hor_layer1,
- },
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
};
static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
- 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f,
- 0.17179f, 0.68612f, 0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f,
- -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f,
- -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f,
- 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f,
- 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f,
- 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f,
- -0.00317f, -0.06403f, 0.95442f, 1.59636f, 0.30602f, -0.05515f, 0.05467f,
- -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f,
- 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f,
- 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f,
- 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f,
- 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f,
- 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f,
- 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f,
- -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f,
- -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f,
- 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f,
- 0.72124f, 0.12769f,
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
};
static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
- -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f,
- -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f,
- -0.55834f, -0.36938f, 1.56759f, 0.07238f,
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
};
static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
- -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f,
- 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f,
- 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f,
- 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f,
- -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f,
- 1.05440f, 0.62580f, -0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f,
- -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f,
- 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f,
- -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f,
- 0.99208f,
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
};
static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
- 0.68774f,
- 0.88572f,
- 0.77462f,
- 3.05667f,
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
};
static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
@@ -609,14 +664,9 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x8_ver_layer0,
- av1_tx_type_nn_weights_16x8_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_16x8_ver_layer0,
- av1_tx_type_nn_bias_16x8_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
};
/******************************************************************************/
@@ -687,445 +737,253 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
};
/******************************************************************************/
-// Tx type model for 16x32 block.
-static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = {
- 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f,
- 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
- -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f,
- 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f,
- 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f,
- 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f,
- 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f,
- 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f,
- -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f,
- 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f,
- 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f,
- -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f,
- 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f,
- 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f,
- 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f,
- -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f,
- 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f,
- 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f,
- 0.43289f, -0.00362f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = {
- -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f,
- -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f,
- -0.14741f, -0.43954f, 1.72137f, -0.21704f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = {
- -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f,
- 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f,
- 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f,
- 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f,
- -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f,
- 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f,
- -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f,
- 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f,
- -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f,
- 0.63304f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = {
- 0.71765f,
- 1.40400f,
- 0.32221f,
- 3.07234f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = {
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
8, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x32_hor_layer0,
- av1_tx_type_nn_weights_16x32_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_16x32_hor_layer0,
- av1_tx_type_nn_bias_16x32_hor_layer1,
- },
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
};
+/******************************************************************************/
-static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = {
- -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f,
- -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f,
- 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f,
- 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f,
- 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f,
- 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f,
- 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f,
- 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, -0.48812f, 0.72875f,
- 0.06585f, 0.18818f, -0.02109f, -0.10996f, 0.00187f, -0.02078f, 0.04484f,
- -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f,
- 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f,
- -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f,
- -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f,
- -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f, 0.19723f,
- 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f,
- 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f,
- 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f,
- -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f,
- 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f,
- 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f,
- -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f,
- 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f,
- -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f,
- -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f,
- -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f,
- 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f,
- 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f,
- -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f,
- 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f,
- 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f,
- -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f,
- -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f,
- 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f,
- -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f,
- 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f,
- 0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f,
- 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f,
- -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f,
- -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f,
- 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f,
- 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f,
- 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f,
- 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f,
- -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f,
- -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f,
- -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f,
- 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f,
- -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f,
- -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f,
- -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f,
- 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f,
- 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f,
- -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f,
- 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f,
- -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, -2.43802f, 0.41834f,
- 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f,
- -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f,
- 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f,
- -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f,
- 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f,
- -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f,
- 0.57665f, 0.14171f, 0.54693f, -0.22144f, -0.59664f, 0.13295f, 0.07057f,
- -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f,
- 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f,
- -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f,
- 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f,
- -0.41329f, 0.02632f, 0.23411f, 0.25344f, 0.16468f, 0.31007f, 0.21845f,
- 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f,
- 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f,
- 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f,
- -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f,
- -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f,
- -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f,
- -0.01752f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = {
- 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f,
- -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f,
- 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f,
- 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f,
- -0.11834f, -0.24998f, -0.33976f, 0.35497f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = {
- 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f,
- 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f,
- 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f,
- -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f,
- 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f,
- -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f,
- 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f,
- 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f,
- 0.14139f, 0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f,
- -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f,
- -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f,
- 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f,
- 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f,
- 0.12213f, 0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f,
- -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f,
- -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f,
- 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f,
- -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f,
- 0.14978f, 0.91325f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = {
- 1.03659f,
- 1.80249f,
- 1.25710f,
- 1.32000f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = {
- 16, // num_inputs
- 4, // num_outputs
- 1, // num_hidden_layers
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- 32,
+ 16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x32_ver_layer0,
- av1_tx_type_nn_weights_16x32_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_16x32_ver_layer0,
- av1_tx_type_nn_bias_16x32_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
};
-/******************************************************************************/
-// Tx type model for 32x16 block.
-static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = {
- -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f,
- -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f,
- 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f,
- 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f,
- 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f,
- 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f,
- 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f,
- -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f,
- -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f,
- -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f,
- 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f,
- 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f,
- 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f,
- 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f,
- 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f,
- -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f,
- 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f,
- -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f,
- 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f,
- -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f,
- -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f,
- 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f,
- -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f,
- 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f,
- 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f,
- 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 0.56870f,
- 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f,
- 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f,
- -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f,
- -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f,
- 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f,
- -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f,
- 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f,
- 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f,
- 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f,
- 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f,
- 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f,
- -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f,
- 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f,
- 0.04214f, 0.00030f, 0.02410f, 0.19966f, -0.04246f, 0.00442f, 0.23121f,
- 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f,
- 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f,
- -0.72638f, 0.02774f, 0.48540f, -0.72016f, -0.27721f, 0.31559f, 0.07322f,
- 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f,
- 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f,
- 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f,
- 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f,
- 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f,
- 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f,
- 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f,
- 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f,
- -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f,
- 0.17597f, 0.28668f, 0.31193f, -0.43281f, 0.43267f, -0.50495f, 0.01969f,
- 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f,
- -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f,
- 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f,
- -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f,
- 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f,
- -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f,
- -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f,
- 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f,
- 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f,
- -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f,
- 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f,
- -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f,
- 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f,
- -0.18461f, -0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f,
- 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f,
- 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f,
- -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f,
- -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f,
- -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f,
- -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f,
- -0.14533f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = {
- 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f,
- -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f,
- 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f,
- 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f,
- -0.14988f, -0.67044f, 0.79659f, 0.70610f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = {
- 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f,
- -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f,
- 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f,
- 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f,
- -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f,
- 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f,
- 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f,
- -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f,
- 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f,
- -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 0.10969f,
- 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f,
- -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f,
- 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 0.01641f,
- 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f,
- -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f,
- -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f,
- -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f,
- -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f,
- 0.89060f, 0.75948f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = {
- 0.95359f,
- 1.56043f,
- 1.06017f,
- 2.54520f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = {
- 16, // num_inputs
- 4, // num_outputs
- 1, // num_hidden_layers
- {
- 32,
- }, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_32x16_hor_layer0,
- av1_tx_type_nn_weights_32x16_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_32x16_hor_layer0,
- av1_tx_type_nn_bias_32x16_hor_layer1,
- },
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
};
-static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = {
- 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f,
- 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f,
- -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f,
- -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f,
- 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f,
- 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f,
- -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f,
- 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f,
- 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f,
- 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f,
- -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f,
- 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f,
- 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f,
- -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f,
- -0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f,
- -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f,
- -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f,
- 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f,
- 0.59360f, 0.25468f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = {
- -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f,
- -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f,
- 0.83852f, -0.25536f, 1.33200f, -0.24444f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = {
- -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f,
- 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f,
- 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f,
- 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f,
- -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f,
- 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f,
- 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f,
- 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f,
- 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f,
- 1.01058f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = {
- 0.83082f,
- 2.03845f,
- 0.59627f,
- 2.31341f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = {
- 8, // num_inputs
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
- 16,
+ 8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_32x16_ver_layer0,
- av1_tx_type_nn_weights_32x16_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_32x16_ver_layer0,
- av1_tx_type_nn_bias_32x16_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
};
/******************************************************************************/
// Map tx_size to its corresponding neural net model for tx type prediction.
static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = {
- &av1_tx_type_nnconfig_4x4, // 4x4
- &av1_tx_type_nnconfig_8x8, // 8x8
- &av1_tx_type_nnconfig_16x16, // 16x16
- NULL, // 32x32
- NULL, // 64x64
- &av1_tx_type_nnconfig_4x8_hor, // 4x8
- &av1_tx_type_nnconfig_8x4_hor, // 8x4
- &av1_tx_type_nnconfig_8x16_hor, // 8x16
- &av1_tx_type_nnconfig_16x8_hor, // 16x8
- &av1_tx_type_nnconfig_16x32_hor, // 16x32
- &av1_tx_type_nnconfig_32x16_hor, // 32x16
- NULL, // 32x64
- NULL, // 64x32
- NULL, // 4x16
- NULL, // 16x4
- NULL, // 8x32
- NULL, // 32x8
- NULL, // 16x64
- NULL, // 64x16
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
};
static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = {
- &av1_tx_type_nnconfig_4x4, // 4x4 transform
- &av1_tx_type_nnconfig_8x8, // 8x8 transform
- &av1_tx_type_nnconfig_16x16, // 16x16 transform
- NULL, // 32x32 transform
- NULL, // 64x64 transform
- &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
- &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
- &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
- &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
- &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform
- &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform
- NULL, // 32x64 transform
- NULL, // 64x32 transform
- NULL, // 4x16 transform
- NULL, // 16x4 transform
- NULL, // 8x32 transform
- NULL, // 32x8 transform
- NULL, // 16x64 transform
- NULL, // 64x16 transform
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
};
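Both maps are indexed by TX_SIZE in the order shown in the comments; a NULL entry means no model was trained for that size, so callers must check before predicting. Keeping separate hor/ver tables reflects that the horizontal and vertical 1-D kernels are predicted from features taken along different block dimensions, which is also why num_inputs can differ between the _hor and _ver variants of the same rectangular size. A lookup sketch (have_tx_type_model is illustrative):

static int have_tx_type_model(TX_SIZE tx_size) {
  const NN_CONFIG *hor = av1_tx_type_nnconfig_map_hor[tx_size];
  const NN_CONFIG *ver = av1_tx_type_nnconfig_map_ver[tx_size];
  return hor != NULL && ver != NULL;  // NULL: no pruning, search all types
}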
// Tx split model for 4x8 block.
@@ -2083,4 +1941,4 @@ static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index c71f2e74c..07615543c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
}
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+ int8_t cos_bit, const int instride,
+ const int outstride) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
@@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
// stage 1
__m128i x1[64];
- x1[0] = _mm_add_epi32(input[0], input[63]);
- x1[63] = _mm_sub_epi32(input[0], input[63]);
- x1[1] = _mm_add_epi32(input[1], input[62]);
- x1[62] = _mm_sub_epi32(input[1], input[62]);
- x1[2] = _mm_add_epi32(input[2], input[61]);
- x1[61] = _mm_sub_epi32(input[2], input[61]);
- x1[3] = _mm_add_epi32(input[3], input[60]);
- x1[60] = _mm_sub_epi32(input[3], input[60]);
- x1[4] = _mm_add_epi32(input[4], input[59]);
- x1[59] = _mm_sub_epi32(input[4], input[59]);
- x1[5] = _mm_add_epi32(input[5], input[58]);
- x1[58] = _mm_sub_epi32(input[5], input[58]);
- x1[6] = _mm_add_epi32(input[6], input[57]);
- x1[57] = _mm_sub_epi32(input[6], input[57]);
- x1[7] = _mm_add_epi32(input[7], input[56]);
- x1[56] = _mm_sub_epi32(input[7], input[56]);
- x1[8] = _mm_add_epi32(input[8], input[55]);
- x1[55] = _mm_sub_epi32(input[8], input[55]);
- x1[9] = _mm_add_epi32(input[9], input[54]);
- x1[54] = _mm_sub_epi32(input[9], input[54]);
- x1[10] = _mm_add_epi32(input[10], input[53]);
- x1[53] = _mm_sub_epi32(input[10], input[53]);
- x1[11] = _mm_add_epi32(input[11], input[52]);
- x1[52] = _mm_sub_epi32(input[11], input[52]);
- x1[12] = _mm_add_epi32(input[12], input[51]);
- x1[51] = _mm_sub_epi32(input[12], input[51]);
- x1[13] = _mm_add_epi32(input[13], input[50]);
- x1[50] = _mm_sub_epi32(input[13], input[50]);
- x1[14] = _mm_add_epi32(input[14], input[49]);
- x1[49] = _mm_sub_epi32(input[14], input[49]);
- x1[15] = _mm_add_epi32(input[15], input[48]);
- x1[48] = _mm_sub_epi32(input[15], input[48]);
- x1[16] = _mm_add_epi32(input[16], input[47]);
- x1[47] = _mm_sub_epi32(input[16], input[47]);
- x1[17] = _mm_add_epi32(input[17], input[46]);
- x1[46] = _mm_sub_epi32(input[17], input[46]);
- x1[18] = _mm_add_epi32(input[18], input[45]);
- x1[45] = _mm_sub_epi32(input[18], input[45]);
- x1[19] = _mm_add_epi32(input[19], input[44]);
- x1[44] = _mm_sub_epi32(input[19], input[44]);
- x1[20] = _mm_add_epi32(input[20], input[43]);
- x1[43] = _mm_sub_epi32(input[20], input[43]);
- x1[21] = _mm_add_epi32(input[21], input[42]);
- x1[42] = _mm_sub_epi32(input[21], input[42]);
- x1[22] = _mm_add_epi32(input[22], input[41]);
- x1[41] = _mm_sub_epi32(input[22], input[41]);
- x1[23] = _mm_add_epi32(input[23], input[40]);
- x1[40] = _mm_sub_epi32(input[23], input[40]);
- x1[24] = _mm_add_epi32(input[24], input[39]);
- x1[39] = _mm_sub_epi32(input[24], input[39]);
- x1[25] = _mm_add_epi32(input[25], input[38]);
- x1[38] = _mm_sub_epi32(input[25], input[38]);
- x1[26] = _mm_add_epi32(input[26], input[37]);
- x1[37] = _mm_sub_epi32(input[26], input[37]);
- x1[27] = _mm_add_epi32(input[27], input[36]);
- x1[36] = _mm_sub_epi32(input[27], input[36]);
- x1[28] = _mm_add_epi32(input[28], input[35]);
- x1[35] = _mm_sub_epi32(input[28], input[35]);
- x1[29] = _mm_add_epi32(input[29], input[34]);
- x1[34] = _mm_sub_epi32(input[29], input[34]);
- x1[30] = _mm_add_epi32(input[30], input[33]);
- x1[33] = _mm_sub_epi32(input[30], input[33]);
- x1[31] = _mm_add_epi32(input[31], input[32]);
- x1[32] = _mm_sub_epi32(input[31], input[32]);
+ x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+ x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+ x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+ x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+ x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+ x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+ x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+ x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+ x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+ x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+ x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+ x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+ x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+ x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+ x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+ x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+ x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+ x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+ x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+ x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+ x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+ x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+ x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+ x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+ x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+ x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+ x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+ x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+ x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+ x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+ x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+ x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+ x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+ x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+ x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+ x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+ x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+ x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+ x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+ x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+ x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+ x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+ x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+ x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+ x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+ x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+ x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+ x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+ x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+ x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+ x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+ x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+ x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+ x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+ x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+ x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+ x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+ x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+ x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+ x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+ x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+ x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+ x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+ x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
// stage 2
__m128i x2[64];
@@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
x10[48], __rounding, cos_bit);
// stage 11
- output[0] = x10[0];
- output[1] = x10[32];
- output[2] = x10[16];
- output[3] = x10[48];
- output[4] = x10[8];
- output[5] = x10[40];
- output[6] = x10[24];
- output[7] = x10[56];
- output[8] = x10[4];
- output[9] = x10[36];
- output[10] = x10[20];
- output[11] = x10[52];
- output[12] = x10[12];
- output[13] = x10[44];
- output[14] = x10[28];
- output[15] = x10[60];
- output[16] = x10[2];
- output[17] = x10[34];
- output[18] = x10[18];
- output[19] = x10[50];
- output[20] = x10[10];
- output[21] = x10[42];
- output[22] = x10[26];
- output[23] = x10[58];
- output[24] = x10[6];
- output[25] = x10[38];
- output[26] = x10[22];
- output[27] = x10[54];
- output[28] = x10[14];
- output[29] = x10[46];
- output[30] = x10[30];
- output[31] = x10[62];
- output[32] = x10[1];
- output[33] = x10[33];
- output[34] = x10[17];
- output[35] = x10[49];
- output[36] = x10[9];
- output[37] = x10[41];
- output[38] = x10[25];
- output[39] = x10[57];
- output[40] = x10[5];
- output[41] = x10[37];
- output[42] = x10[21];
- output[43] = x10[53];
- output[44] = x10[13];
- output[45] = x10[45];
- output[46] = x10[29];
- output[47] = x10[61];
- output[48] = x10[3];
- output[49] = x10[35];
- output[50] = x10[19];
- output[51] = x10[51];
- output[52] = x10[11];
- output[53] = x10[43];
- output[54] = x10[27];
- output[55] = x10[59];
- output[56] = x10[7];
- output[57] = x10[39];
- output[58] = x10[23];
- output[59] = x10[55];
- output[60] = x10[15];
- output[61] = x10[47];
- output[62] = x10[31];
- output[63] = x10[63];
+ output[0 * outstride] = x10[0];
+ output[1 * outstride] = x10[32];
+ output[2 * outstride] = x10[16];
+ output[3 * outstride] = x10[48];
+ output[4 * outstride] = x10[8];
+ output[5 * outstride] = x10[40];
+ output[6 * outstride] = x10[24];
+ output[7 * outstride] = x10[56];
+ output[8 * outstride] = x10[4];
+ output[9 * outstride] = x10[36];
+ output[10 * outstride] = x10[20];
+ output[11 * outstride] = x10[52];
+ output[12 * outstride] = x10[12];
+ output[13 * outstride] = x10[44];
+ output[14 * outstride] = x10[28];
+ output[15 * outstride] = x10[60];
+ output[16 * outstride] = x10[2];
+ output[17 * outstride] = x10[34];
+ output[18 * outstride] = x10[18];
+ output[19 * outstride] = x10[50];
+ output[20 * outstride] = x10[10];
+ output[21 * outstride] = x10[42];
+ output[22 * outstride] = x10[26];
+ output[23 * outstride] = x10[58];
+ output[24 * outstride] = x10[6];
+ output[25 * outstride] = x10[38];
+ output[26 * outstride] = x10[22];
+ output[27 * outstride] = x10[54];
+ output[28 * outstride] = x10[14];
+ output[29 * outstride] = x10[46];
+ output[30 * outstride] = x10[30];
+ output[31 * outstride] = x10[62];
+ output[32 * outstride] = x10[1];
+ output[33 * outstride] = x10[33];
+ output[34 * outstride] = x10[17];
+ output[35 * outstride] = x10[49];
+ output[36 * outstride] = x10[9];
+ output[37 * outstride] = x10[41];
+ output[38 * outstride] = x10[25];
+ output[39 * outstride] = x10[57];
+ output[40 * outstride] = x10[5];
+ output[41 * outstride] = x10[37];
+ output[42 * outstride] = x10[21];
+ output[43 * outstride] = x10[53];
+ output[44 * outstride] = x10[13];
+ output[45 * outstride] = x10[45];
+ output[46 * outstride] = x10[29];
+ output[47 * outstride] = x10[61];
+ output[48 * outstride] = x10[3];
+ output[49 * outstride] = x10[35];
+ output[50 * outstride] = x10[19];
+ output[51 * outstride] = x10[51];
+ output[52 * outstride] = x10[11];
+ output[53 * outstride] = x10[43];
+ output[54 * outstride] = x10[27];
+ output[55 * outstride] = x10[59];
+ output[56 * outstride] = x10[7];
+ output[57 * outstride] = x10[39];
+ output[58 * outstride] = x10[23];
+ output[59 * outstride] = x10[55];
+ output[60 * outstride] = x10[15];
+ output[61 * outstride] = x10[47];
+ output[62 * outstride] = x10[31];
+ output[63 * outstride] = x10[63];
}
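The new instride/outstride parameters generalize the 64-point kernel from packed to strided buffers: element k is now read from input[k * instride] and written to output[k * outstride]. Passing 1 for both (as the lowbd_fwd_txfm2d_* call sites below do) reproduces the old contiguous behavior, while passing the number of vector columns lets the kernel walk one column of a row-major 2-D __m128i layout in place.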
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index abb95f31e..8ec0256eb 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@
#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
@@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
}
}
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+ col_num);
+ }
+}
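Here txfm_size / num_per_128 = 64 / 4 = 16 vector columns of four 32-bit lanes each, so sixteen strided kernel invocations cover the full 64-column pass; stage_range is accepted only to match the TxfmFuncSSE2 signature and goes unused.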
+
static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+ case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
default: assert(0);
}
return NULL;
@@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
transpose_32(txfm_size, buf_128, out_128);
}
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+ /* column-wise transform */
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+ /* row-wise transform */
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+ col_num, (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32x32(buf_128, out_128);
+}
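Note the asymmetry in the row pass: only col_num >> 1 = 8 of the 16 vector columns are transformed, with the output stride halved to pack them, and the final round-shift covers (col_num >> 1) * (txfm_size >> 1) = 8 * 32 = 256 vectors, i.e. 1024 int32 values. In other words, only the low-frequency 32x32 quadrant of the 64x64 transform is produced, matching AV1's rule that coefficients beyond index 31 of a 64-point dimension are never coded.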
+
void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
@@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
}
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
const __m128i *inputB, __m128i *output) {
__m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
@@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
@@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < width_div8; ++j) {
@@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
}
av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < (32 / 4); ++j) {
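The extra argument threaded through av1_round_shift_rect_array_32_sse4_1 makes the rectangular-transform scaling explicit: NewSqrt2 is the fixed-point sqrt(2) constant from av1_txfm.h (assumed to be 5793 ~= sqrt(2) * 2^12) that compensates for the sqrt(2) gain mismatch of non-square transforms such as the 64x32 and 32x64 paths above.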
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
index c582ca0e3..38707137c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM_AVX2_H_
-#define AV1_FWD_TXFM_AVX2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
#include <immintrin.h>
static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
@@ -100,4 +100,4 @@ static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
*in1 = _mm256_srai_epi32(temp1, cos_bit);
}
-#endif // AV1_FWD_TXFM_AVX2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
index aa14d3ade..99a6b9082 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
#include <immintrin.h>
@@ -114,4 +114,4 @@ static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
}
#endif
-#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
index 0adefecdb..6df2a8bdb 100644
--- a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXMF1D_SSE2_H_
-#define AV1_TXMF1D_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
@@ -29,7 +29,8 @@ void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
int8_t cos_bit);
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- int8_t cos_bit);
+ int8_t cos_bit, const int instride,
+ const int outstride);
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
@@ -138,4 +139,4 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input,
}
#endif
-#endif // AV1_TXMF1D_SSE2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 000000000..7642f57d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ yy_storeu_256(pre_buf, y_zeros);
+ pre_buf += 32;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < height);
+ } else if (width == 8) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + width) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + width + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + width + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + width + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ *(int32_t *)(ls + stride + width) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < height);
+ } else {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
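This new AVX2 path mirrors the SSE4.1 version below: it packs each 32-bit coefficient down to a saturated 8-bit absolute level and writes it into a map of stride width + TX_PAD_HOR, with zeroed padding rows above (TX_PAD_TOP) and below (TX_PAD_BOTTOM) so the context coder can read neighbors without bounds checks. A few details worth noting: the permute4x64/shuffle_epi32 pairs undo the per-128-bit-lane interleaving introduced by _mm256_packs_epi32 and _mm256_packs_epi16; the *(int32_t *)(ls + width) = 0 stores clear the horizontal pad in one write, which assumes that pad is 4 bytes wide; and the bottom-pad loop rounds its length up to a multiple of 32 ((bottom_len + 31) & ~31) and starts its stores early, harmlessly re-zeroing the tail of level rows that are filled afterwards rather than issuing a partial store.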
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
index b3a879b0f..5e0687cd3 100644
--- a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -14,43 +14,55 @@
#include <smmintrin.h> /* SSE4.1 */
#include "aom/aom_integer.h"
-#include "aom_dsp/x86/mem_sse2.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
const int height, uint8_t *const levels) {
const int stride = width + TX_PAD_HOR;
- memset(levels - TX_PAD_TOP * stride, 0,
- sizeof(*levels) * TX_PAD_TOP * stride);
- memset(levels + stride * height, 0,
- sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
-
const __m128i zeros = _mm_setzero_si128();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+ pre_buf += 16;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * height;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
int i = 0;
uint8_t *ls = levels;
const tran_low_t *cf = coeff;
if (width == 4) {
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
- _mm_storeu_si128((__m128i *)ls, lsAB);
+ xx_storeu_128(ls, lsAB);
ls += (stride << 1);
cf += (width << 1);
i += 2;
} while (i < height);
} else if (width == 8) {
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
- _mm_storeu_si128((__m128i *)ls, absAB8);
+ xx_storeu_128(ls, absAB8);
ls += stride;
cf += width;
i += 1;
@@ -59,16 +71,16 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
do {
int j = 0;
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
- const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
- const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absCD = _mm_abs_epi16(coeffCD);
const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
- _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+ xx_storeu_128(ls + j, absABCD);
j += 16;
cf += 16;
} while (j < width);
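
In the SSE4.1 version, the two memsets that cleared the rows above and below the level map are replaced by unaligned 16-byte zero stores. The store loops round the fill up to a multiple of 16, so they can overshoot: the top-pad overshoot lands at the start of levels[], which the main loop rewrites anyway, and the bottom loop stops covering the TX_PAD_END tail that the old memset cleared, apparently because that tail exists only to keep vector reads in bounds. A minimal sketch of the store pattern (illustrative helper, assuming the caller tolerates up to 15 bytes of overshoot):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Zero-fill len bytes in 16-byte chunks, deliberately writing past
     * the end when len is not a multiple of 16. */
    static void zero_fill_16(uint8_t *buf, int len) {
      const __m128i zeros = _mm_setzero_si128();
      uint8_t *const end = buf + len;
      do {
        _mm_storeu_si128((__m128i *)buf, zeros);
        buf += 16;
      } while (buf < end);
    }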
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 4cd6371a6..535485ae8 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -17,6 +17,7 @@
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
@@ -393,7 +394,32 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
_mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
}
-static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+ const int stride) {
+ _mm_storeu_si128((__m128i *)(output), res[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), res[1]);
+ _mm_storeu_si128((__m128i *)(output + stride), res[2]);
+ _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -589,7 +615,9 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
out[13] = u[3]; // buf0[3]
}
-static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
@@ -780,82 +808,82 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
switch (tx_type) {
case DCT_DCT:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_DCT:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_ADST:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_DCT:
load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_FLIPADST:
load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_ADST:
load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
@@ -940,7 +968,26 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
convert_8x8_to_16x16(in, out);
}
-static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -962,7 +1009,6 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
__m128i u[16], v[16], x;
- const int col_num = 4;
int col;
// Calculate the column 0, 1, 2, 3
@@ -1226,7 +1272,8 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
}
}
-static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
@@ -1271,25 +1318,25 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
__m128i u[16], v[16], x, y;
int col;
- for (col = 0; col < 4; ++col) {
+ for (col = 0; col < num_cols; ++col) {
// stage 0
// stage 1
- u[0] = in[0 * 4 + col];
- u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
- u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
- u[3] = in[8 * 4 + col];
- u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
- u[5] = in[12 * 4 + col];
- u[6] = in[4 * 4 + col];
- u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
- u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
- u[9] = in[14 * 4 + col];
- u[10] = in[6 * 4 + col];
- u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
- u[12] = in[2 * 4 + col];
- u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
- u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
- u[15] = in[10 * 4 + col];
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
// stage 2
v[0] = u[0];
@@ -1453,22 +1500,22 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
// stage 9
- out[0 * 4 + col] = v[1];
- out[1 * 4 + col] = v[14];
- out[2 * 4 + col] = v[3];
- out[3 * 4 + col] = v[12];
- out[4 * 4 + col] = v[5];
- out[5 * 4 + col] = v[10];
- out[6 * 4 + col] = v[7];
- out[7 * 4 + col] = v[8];
- out[8 * 4 + col] = v[9];
- out[9 * 4 + col] = v[6];
- out[10 * 4 + col] = v[11];
- out[11 * 4 + col] = v[4];
- out[12 * 4 + col] = v[13];
- out[13 * 4 + col] = v[2];
- out[14 * 4 + col] = v[15];
- out[15 * 4 + col] = v[0];
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
}
}
@@ -1482,6 +1529,11 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
col_txfm_8x8_rounding(&in[48], shift);
}
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+}
+
static void write_buffer_16x16(const __m128i *in, int32_t *output) {
const int size_8x8 = 16 * 4;
write_buffer_8x8(&in[0], output);
@@ -1499,85 +1551,86 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
const int txw_idx = get_txw_idx(TX_16X16);
const int txh_idx = get_txh_idx(TX_16X16);
+ const int col_num = 4;
switch (tx_type) {
case DCT_DCT:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_DCT:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_ADST:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_ADST:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_DCT:
load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
@@ -1585,3 +1638,146 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
}
(void)bd;
}
+
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+ for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+ col_txfm(in, in, bit, 0);
+ col_txfm_8x8_rounding(in, -shift[1]);
+ transpose_8x8(in, out + i * 16);
+ }
+
+ if (lr_flip) {
+ flip_buf_sse4_1(in, out, 32);
+ row_txfm(in, out, bit, 2);
+ } else {
+ row_txfm(out, out, bit, 2);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ transpose_8x8(out + i * 16, in);
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_16x8(in, coeff + i * 8, 16);
+ }
+
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x16_rounding(in, -shift[1]);
+ transpose_8x8(in, out);
+ transpose_8x8(in + 16, out + 16);
+
+ for (int i = 0; i < 2; i++) {
+ row_txfm(out + i * 16, out, bit, 0);
+ transpose_8x8(out, in);
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_8x8(in, coeff + i * 64);
+ }
+
+ (void)bd;
+}
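
The new rectangular 16x8 and 8x16 paths reuse the square kernels: fdct8x8/fadst8x8 gain an ignored column-count argument so they fit the shared fwd_transform_1d_sse4_1 signature, while fdct16x16/fadst16x16 drop their hardcoded `const int col_num = 4` and index the row-major __m128i layout by the new parameter (4 columns for 16-wide data, 2 for 8-wide). One quirk worth noting: flip_buf_sse4_1 writes `in[30 - i]`, which equals `size - 2 - i` only for its single size == 32 call site. A generalized form, as a sketch:

    #include <smmintrin.h>

    /* Left-right flip of `size` packed registers; even-indexed entries
     * map to size - 2 - i, odd-indexed to size - i, matching
     * flip_buf_sse4_1 for size == 32 without the hardcoded 30. */
    static void flip_buf_generic(__m128i *in, __m128i *out, int size) {
      for (int i = 0; i < size; i += 2) in[size - 2 - i] = out[i];
      for (int i = 1; i < size; i += 2) in[size - i] = out[i];
    }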
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 000000000..06aaaa7ee
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
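+/* acc_stat_avx2: load 16 window pixels, gather eight overlapping pixel
+ * pairs with the shuffle mask, widen them to 16 bits, and madd each pair
+ * against the current two-sample kernel in *kl, accumulating eight
+ * 32-bit dot products into dst. */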
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m256i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
+ const __m256i dst0 = yy_loadu_256(dst);
+ const __m256i r0 = _mm256_add_epi32(dst0, d0);
+ yy_storeu_256(dst, r0);
+}
+
+static INLINE void acc_stat_win7_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
+ __m256i xq_coeff =
+ pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
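
av1_lowbd_pixel_proj_error_avx2 scores a candidate self-guided projection: per pixel, with u = dat << SGRPROJ_RST_BITS, it forms v = xq[0]*(flt0 - u) + xq[1]*(flt1 - u) (dropping a term when the corresponding radius is zero), rounds v down by SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS, and accumulates the squared difference against the source; the per-row cvtepi32_epi64 step widens the 32-bit error sums before they can overflow. A scalar reference for the both-filters case, as a sketch matching the in-file tail loops (not the aom C fallback itself):

    #include <stdint.h>

    static int64_t pixel_proj_error_ref(const uint8_t *src, int width,
                                        int height, int src_stride,
                                        const uint8_t *dat, int dat_stride,
                                        const int32_t *flt0, int flt0_stride,
                                        const int32_t *flt1, int flt1_stride,
                                        const int xq[2], int rst_bits,
                                        int prj_bits) {
      const int shift = rst_bits + prj_bits;
      int64_t err = 0;
      for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
          const int32_t u = (int32_t)dat[j] << rst_bits;
          const int32_t v = xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
          /* ROUND_POWER_OF_TWO(v, shift) */
          const int32_t e =
              ((v + (1 << (shift - 1))) >> shift) + dat[j] - src[j];
          err += (int64_t)e * e;
        }
        src += src_stride;
        dat += dat_stride;
        flt0 += flt0_stride;
        flt1 += flt1_stride;
      }
      return err;
    }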
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 000000000..04e4d1afc
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+ const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+ const __m128i v0 = _mm_madd_epi16(
+ xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i v1 = _mm_madd_epi16(
+ xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m128i sum32 = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width - 16; j += 16) {
+ const __m128i d = xx_loadu_128(dat + j);
+ const __m128i s = xx_loadu_128(src + j);
+ const __m128i d0 = _mm_cvtepu8_epi16(d);
+ const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+ const __m128i s0 = _mm_cvtepu8_epi16(s);
+ const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
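
Both compute_stats implementations accumulate raw, uncentered sums in integers and remove the window mean only once at the end, via the identity

    sum((d - avg) * (x - avg)) = sum(d * x) - avg * (sum(d) + sum(x)) + avg^2 * N

which is exactly the `M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l])` line; striping the row loop in blocks of 64 flushes the 32-bit accumulators into 64-bit ones before they can overflow. Note also that these SSE4.1 vector loops use strict bounds (`j < width - 8`), so a width that is an exact multiple of the vector step finishes in the scalar tail, whereas the AVX2 file uses `j <= width - 16` and consumes it vectorized. A small check of the identity (illustrative helper):

    /* Equals sum((d[i] - avg) * (x[i] - avg)) up to floating-point
     * rounding, computed from uncentered sums as compute_stats does. */
    static double centered_dot_via_identity(const double *d, const double *x,
                                            int n, double avg) {
      double dot = 0, sum_d = 0, sum_x = 0;
      for (int i = 0; i < n; ++i) {
        dot += d[i] * x[i];
        sum_d += d[i];
        sum_x += x[i];
      }
      return dot - avg * (sum_d + sum_x) + avg * avg * n;
    }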
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
index f776e84c7..2a792f14e 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -14,7 +14,7 @@
#include <smmintrin.h>
#include "aom_dsp/x86/synonyms.h"
-
+#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom/aom_integer.h"
#include "av1/common/reconinter.h"
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
uint64_t csse;
const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
- const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff);
+ const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
__m256i v_acc0_q = _mm256_setzero_si256();
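
Replacing _mm256_set1_epi64x with yy_set1_64_from_32i sidesteps toolchains, notably 32-bit MSVC, where the 64-bit broadcast intrinsic is unavailable; the synonym builds the same zero-extension mask from 32-bit pieces. A plausible shape for the helper, assuming (not quoting) the synonyms_avx2.h implementation:

    #include <immintrin.h>
    #include <stdint.h>

    /* Broadcast a 32-bit value into the low half of every 64-bit lane. */
    static __m256i set1_64_from_32i(int32_t a) {
    #if defined(_MSC_VER) && defined(_M_IX86)
      return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
    #else
      return _mm256_set1_epi64x((uint32_t)a);
    #endif
    }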
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 000000000..5c8e0e09d
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
index 05860e8c0..daabf6766 100644
--- a/third_party/aom/av1/exports_dec
+++ b/third_party/aom/av1/exports_dec
@@ -1,2 +1,3 @@
data aom_codec_av1_dx_algo
text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 000000000..dab377575
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd
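
The exports_* files list symbols to expose from a shared libaom build, one per line: a class keyword (`text` for functions, `data` for objects) followed by the symbol name, which the build machinery translates into the platform's linker syntax. For exports_dec above, a GNU ld version script generated from it could look like this (illustrative output, not the actual generated file):

    {
      global:
        aom_codec_av1_dx_algo;
        aom_codec_av1_dx;
        av1_add_film_grain;
      local: *;
    };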
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
index 19af5c43b..a07438cfe 100644
--- a/third_party/aom/build/cmake/aom_config_defaults.cmake
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -7,90 +7,190 @@
# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
# License 1.0 was not distributed with this source code in the PATENTS file, you
# can obtain it at www.aomedia.org/license/patent.
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# This file sets default values for libaom configuration variables. All libaom
+# config variables are added to the CMake variable cache via the macros provided
+# in util.cmake.
+
#
-# Defaults for every libaom configuration variable. Here we add all libaom
-# config variables to the cmake variable cache, but omit the FORCE parameter to
-# allow users to specify values when executing cmake to generate build files.
-# Values here are used only if not set by the user.
-set(INLINE "" CACHE STRING "Sets INLINE value for current target.")
+# The variables in this section of the file are detected at configuration time,
+# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined
+# in this file.
+#
+
+set_aom_detect_var(INLINE "" STRING "Sets INLINE value for current target.")
# CPUs.
-set(ARCH_ARM 0 CACHE NUMBER "Enables ARM architecture.")
-set(ARCH_MIPS 0 CACHE NUMBER "Enables MIPS architecture.")
-set(ARCH_PPC 0 CACHE NUMBER "Enables PPC architecture.")
-set(ARCH_X86 0 CACHE NUMBER "Enables X86 architecture.")
-set(ARCH_X86_64 0 CACHE NUMBER "Enables X86_64 architecture.")
-
-# ARM optimization flags.
-set(HAVE_NEON 0 CACHE NUMBER "Enables NEON intrinsics optimizations.")
-
-# MIPS optimization flags.
-set(HAVE_DSPR2 0 CACHE NUMBER "Enables DSPR2 optimizations.")
-set(HAVE_MIPS32 0 CACHE NUMBER "Enables MIPS32 optimizations.")
-set(HAVE_MIPS64 0 CACHE NUMBER "Enables MIPS64 optimizations. ")
-set(HAVE_MSA 0 CACHE NUMBER "Enables MSA optimizations.")
-
-# PPC optimization flags.
-set(HAVE_VSX 0 CACHE NUMBER "Enables VSX optimizations.")
-
-# x86/x86_64 optimization flags.
-set(HAVE_AVX 0 CACHE NUMBER "Enables AVX optimizations.")
-set(HAVE_AVX2 0 CACHE NUMBER "Enables AVX2 optimizations.")
-set(HAVE_MMX 0 CACHE NUMBER "Enables MMX optimizations. ")
-set(HAVE_SSE 0 CACHE NUMBER "Enables SSE optimizations.")
-set(HAVE_SSE2 0 CACHE NUMBER "Enables SSE2 optimizations.")
-set(HAVE_SSE3 0 CACHE NUMBER "Enables SSE3 optimizations.")
-set(HAVE_SSE4_1 0 CACHE NUMBER "Enables SSE 4.1 optimizations.")
-set(HAVE_SSE4_2 0 CACHE NUMBER "Enables SSE 4.2 optimizations.")
-set(HAVE_SSSE3 0 CACHE NUMBER "Enables SSSE3 optimizations.")
+set_aom_detect_var(ARCH_ARM 0 NUMBER "Enables ARM architecture.")
+set_aom_detect_var(ARCH_MIPS 0 NUMBER "Enables MIPS architecture.")
+set_aom_detect_var(ARCH_PPC 0 NUMBER "Enables PPC architecture.")
+set_aom_detect_var(ARCH_X86 0 NUMBER "Enables X86 architecture.")
+set_aom_detect_var(ARCH_X86_64 0 NUMBER "Enables X86_64 architecture.")
+
+# ARM feature flags.
+set_aom_detect_var(HAVE_NEON 0 NUMBER "Enables NEON intrinsics optimizations.")
+
+# MIPS feature flags.
+set_aom_detect_var(HAVE_DSPR2 0 NUMBER "Enables DSPR2 optimizations.")
+set_aom_detect_var(HAVE_MIPS32 0 NUMBER "Enables MIPS32 optimizations.")
+set_aom_detect_var(HAVE_MIPS64 0 NUMBER "Enables MIPS64 optimizations.")
+set_aom_detect_var(HAVE_MSA 0 NUMBER "Enables MSA optimizations.")
+
+# PPC feature flags.
+set_aom_detect_var(HAVE_VSX 0 NUMBER "Enables VSX optimizations.")
+
+# x86/x86_64 feature flags.
+set_aom_detect_var(HAVE_AVX 0 NUMBER "Enables AVX optimizations.")
+set_aom_detect_var(HAVE_AVX2 0 NUMBER "Enables AVX2 optimizations.")
+set_aom_detect_var(HAVE_MMX 0 NUMBER "Enables MMX optimizations.")
+set_aom_detect_var(HAVE_SSE 0 NUMBER "Enables SSE optimizations.")
+set_aom_detect_var(HAVE_SSE2 0 NUMBER "Enables SSE2 optimizations.")
+set_aom_detect_var(HAVE_SSE3 0 NUMBER "Enables SSE3 optimizations.")
+set_aom_detect_var(HAVE_SSE4_1 0 NUMBER "Enables SSE 4.1 optimizations.")
+set_aom_detect_var(HAVE_SSE4_2 0 NUMBER "Enables SSE 4.2 optimizations.")
+set_aom_detect_var(HAVE_SSSE3 0 NUMBER "Enables SSSE3 optimizations.")
# Flags describing the build environment.
-set(HAVE_FEXCEPT 0 CACHE NUMBER "Internal flag, GNU fenv.h present for target.")
-set(HAVE_PTHREAD_H 0 CACHE NUMBER "Internal flag, target pthread support.")
-set(HAVE_UNISTD_H 0 CACHE NUMBER "Internal flag, unistd.h present for target.")
-set(HAVE_WXWIDGETS 0 CACHE NUMBER "WxWidgets present.")
+set_aom_detect_var(HAVE_FEXCEPT 0 NUMBER
+ "Internal flag, GNU fenv.h present for target.")
+set_aom_detect_var(HAVE_PTHREAD_H 0 NUMBER
+ "Internal flag, target pthread support.")
+set_aom_detect_var(HAVE_UNISTD_H 0 NUMBER
+ "Internal flag, unistd.h present for target.")
+set_aom_detect_var(HAVE_WXWIDGETS 0 NUMBER "WxWidgets present.")
+
+#
+# Variables in this section can be set from the CMake command line or from
+# within the CMake GUI. The variables control libaom features.
+#
# Build configuration flags.
-set(CONFIG_AV1_DECODER 1 CACHE NUMBER "Enable AV1 decoder.")
-set(CONFIG_AV1_ENCODER 1 CACHE NUMBER "Enable AV1 encoder.")
-set(CONFIG_BIG_ENDIAN 0 CACHE NUMBER "Internal flag.")
-set(CONFIG_GCC 0 CACHE NUMBER "Building with GCC (detected).")
-set(CONFIG_GCOV 0 CACHE NUMBER "Enable gcov support.")
-set(CONFIG_GPROF 0 CACHE NUMBER "Enable gprof support.")
-set(CONFIG_LIBYUV 1 CACHE NUMBER "Enables libyuv scaling/conversion support.")
-set(CONFIG_MSVS 0 CACHE NUMBER "Building with MS Visual Studio (detected).")
-set(CONFIG_MULTITHREAD 1 CACHE NUMBER "Multithread support.")
-set(CONFIG_OS_SUPPORT 0 CACHE NUMBER "Internal flag.")
-set(CONFIG_PIC 0 CACHE NUMBER "Build with PIC enabled.")
-set(CONFIG_RUNTIME_CPU_DETECT 1 CACHE NUMBER "Runtime CPU detection support.")
-set(CONFIG_SHARED 0 CACHE NUMBER "Build shared libs.")
-set(CONFIG_STATIC 1 CACHE NUMBER "Build static libs.")
-set(CONFIG_WEBM_IO 1 CACHE NUMBER "Enables WebM support.")
+set_aom_config_var(AOM_RTCD_FLAGS "" STRING
+ "Arguments to pass to rtcd.pl. Separate with ';'")
+set_aom_config_var(CONFIG_AV1_DECODER 1 NUMBER "Enable AV1 decoder.")
+set_aom_config_var(CONFIG_AV1_ENCODER 1 NUMBER "Enable AV1 encoder.")
+set_aom_config_var(CONFIG_BIG_ENDIAN 0 NUMBER "Internal flag.")
+set_aom_config_var(CONFIG_GCC 0 NUMBER "Building with GCC (detect).")
+set_aom_config_var(CONFIG_GCOV 0 NUMBER "Enable gcov support.")
+set_aom_config_var(CONFIG_GPROF 0 NUMBER "Enable gprof support.")
+set_aom_config_var(CONFIG_LIBYUV 1 NUMBER
+ "Enables libyuv scaling/conversion support.")
+
+set_aom_config_var(CONFIG_MULTITHREAD 1 NUMBER "Multithread support.")
+set_aom_config_var(CONFIG_OS_SUPPORT 0 NUMBER "Internal flag.")
+set_aom_config_var(CONFIG_PIC 0 NUMBER "Build with PIC enabled.")
+set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 NUMBER
+ "Runtime CPU detection support.")
+set_aom_config_var(CONFIG_SHARED 0 NUMBER "Build shared libs.")
+set_aom_config_var(CONFIG_STATIC 1 NUMBER "Build static libs.")
+set_aom_config_var(CONFIG_WEBM_IO 1 NUMBER "Enables WebM support.")
# Debugging flags.
-set(CONFIG_BITSTREAM_DEBUG 0 CACHE NUMBER "Bitstream debugging flag.")
-set(CONFIG_DEBUG 0 CACHE NUMBER "Debug build flag.")
-set(CONFIG_MISMATCH_DEBUG 0 CACHE NUMBER "Mismatch debugging flag.")
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 NUMBER "Bitstream debugging flag.")
+set_aom_config_var(CONFIG_DEBUG 0 NUMBER "Debug build flag.")
+set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 NUMBER "Mismatch debugging flag.")
# AV1 feature flags.
-set(CONFIG_ACCOUNTING 0 CACHE NUMBER "Enables bit accounting.")
-set(CONFIG_ANALYZER 0 CACHE NUMBER "Enables bit stream analyzer.")
-set(CONFIG_COEFFICIENT_RANGE_CHECKING 0 CACHE NUMBER "Coefficient range check.")
-set(CONFIG_DENOISE 0 CACHE NUMBER "Denoise/noise modeling support in encoder.")
-set(CONFIG_FILEOPTIONS 1 CACHE NUMBER "Enables encoder config file support.")
-set(CONFIG_INSPECTION 0 CACHE NUMBER "Enables bitstream inspection.")
-set(CONFIG_INTERNAL_STATS 0 CACHE NUMBER "Enables internal encoder stats.")
-set(CONFIG_LOWBITDEPTH 0 CACHE NUMBER "Enables 8-bit optimized pipeline.")
-set(CONFIG_SIZE_LIMIT 0 CACHE NUMBER "Limit max decode width/height.")
-set(CONFIG_SPATIAL_RESAMPLING 1 CACHE NUMBER "Spatial resampling.")
-set(DECODE_HEIGHT_LIMIT 0 CACHE NUMBER "Set limit for decode height.")
-set(DECODE_WIDTH_LIMIT 0 CACHE NUMBER "Set limit for decode width.")
+set_aom_config_var(CONFIG_ACCOUNTING 0 NUMBER "Enables bit accounting.")
+set_aom_config_var(CONFIG_ANALYZER 0 NUMBER "Enables bit stream analyzer.")
+set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 NUMBER
+ "Coefficient range check.")
+set_aom_config_var(CONFIG_DENOISE 1 NUMBER
+ "Denoise/noise modeling support in encoder.")
+set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
+ "Enables encoder config file support.")
+set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER
+ "Fix the GF length if possible")
+set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
+set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
+ "Enables internal encoder stats.")
+set_aom_config_var(CONFIG_LOWBITDEPTH 0 NUMBER
+ "Enables 8-bit optimized pipeline.")
+set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER
+ "Max profile to support decoding.")
+set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER
+ "Only enables normal tile mode.")
+set_aom_config_var(
+ CONFIG_REDUCED_ENCODER_BORDER 0 NUMBER
+ "Enable reduced border extention for encoder. \
+ Disables superres and resize support."
+ )
+set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.")
+set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.")
+set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
+set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")
+set_aom_config_var(CONFIG_GLOBAL_MOTION_SEARCH 1 NUMBER
+ "Global motion search flag.")
# AV1 experiment flags.
-set(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_COLLECT_RD_STATS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_DIST_8X8 1 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_ENTROPY_STATS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_FP_MB_STATS 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_INTER_STATS_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
-set(CONFIG_RD_DEBUG 0 CACHE NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER
+ "AV1 experiment flag.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_FP_MB_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER
+ "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER
+ "Use sharper encoding settings")
+
+#
+# Variables in this section control optional features of the build system.
+#
+set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF)
+set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF)
+set_aom_option_var(ENABLE_DOCS
+ "Enable documentation generation (doxygen required)." ON)
+set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON)
+set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF)
+set_aom_option_var(
+ ENABLE_IDE_TEST_HOSTING
+ "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
+set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets."
+ ON)
+set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON)
+set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory."
+ ON)
+set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
+ OFF)
+
+# ARM assembly/intrinsics flags.
+set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
+
+# MIPS assembly/intrinsics flags.
+set_aom_option_var(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets."
+ OFF)
+set_aom_option_var(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF)
+
+# VSX intrinsics flags.
+set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
+ ON)
+
+# x86/x86_64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_MMX
+ "Enables MMX optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE
+ "Enables SSE optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE2
+ "Enables SSE2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE3
+ "Enables SSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSSE3
+ "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_1
+ "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_2
+ "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX
+ "Enables AVX optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX2
+ "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
index a12389778..c0c7381e8 100644
--- a/third_party/aom/build/cmake/aom_configure.cmake
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -20,19 +20,6 @@ include(FindThreads)
set(AOM_SUPPORTED_CPU_TARGETS
"arm64 armv7 armv7s generic mips32 mips64 ppc x86 x86_64")
-# Generate the user config settings. This must occur before include of
-# aom_config_defaults.cmake (because it turns every config variable into a cache
-# variable with its own help string).
-get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
-foreach(cache_var ${cmake_cache_vars})
- get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
- set(cmdline_helpstring "No help, variable specified on the command line.")
- if("${cache_var_helpstring}" STREQUAL "${cmdline_helpstring}")
- set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
- endif()
-endforeach()
-string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
-
include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake")
include("${AOM_ROOT}/build/cmake/aom_optimization.cmake")
@@ -40,6 +27,16 @@ include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
include("${AOM_ROOT}/build/cmake/util.cmake")
+# Generate the user config settings.
+list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
+foreach(cache_var ${aom_build_vars})
+ get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
+ if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
+ endif()
+endforeach()
+string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
+
# Detect target CPU.
if(NOT AOM_TARGET_CPU)
if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
@@ -73,7 +70,9 @@ if(NOT AOM_TARGET_CPU)
endif()
if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string.
- set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${CMAKE_TOOLCHAIN_FILE}\\\"")
+ file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}"
+ "${CMAKE_TOOLCHAIN_FILE}")
+ set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"")
set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}")
else()
@@ -81,6 +80,8 @@ else()
set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}")
endif()
set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}")
+file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}")
+set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}")
string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
@@ -90,13 +91,13 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
set(CONFIG_DEBUG 1)
endif()
-if(NOT MSVC)
- if(BUILD_SHARED_LIBS)
- set(CONFIG_PIC 1)
- set(CONFIG_SHARED 1)
- set(CONFIG_STATIC 0)
- endif()
+if(BUILD_SHARED_LIBS)
+ set(CONFIG_PIC 1)
+ set(CONFIG_SHARED 1)
+ set(CONFIG_STATIC 0)
+endif()
+if(NOT MSVC)
if(CONFIG_PIC)
# TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
@@ -109,8 +110,6 @@ if(NOT MSVC)
set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
endif()
endif()
-else()
- set(CONFIG_MSVS 1)
endif()
if(NOT "${AOM_SUPPORTED_CPU_TARGETS}" MATCHES "${AOM_TARGET_CPU}")
@@ -261,13 +260,15 @@ else()
add_compiler_flag_if_supported("-Wlogical-op")
add_compiler_flag_if_supported("-Wpointer-arith")
add_compiler_flag_if_supported("-Wsign-compare")
- add_compiler_flag_if_supported("-Wstack-usage=360000")
add_compiler_flag_if_supported("-Wstring-conversion")
add_compiler_flag_if_supported("-Wtype-limits")
add_compiler_flag_if_supported("-Wuninitialized")
add_compiler_flag_if_supported("-Wunused")
add_compiler_flag_if_supported("-Wvla")
+ add_c_flag_if_supported("-Wstack-usage=100000")
+ add_cxx_flag_if_supported("-Wstack-usage=360000")
+
# TODO(jzern): this could be added as a cxx flags for test/*.cc only, avoiding
# third_party.
add_c_flag_if_supported("-Wshorten-64-to-32")
@@ -278,6 +279,12 @@ else()
# Add -Wundef only for C files to avoid massive gtest warning spam.
add_c_flag_if_supported("-Wundef")
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ if("${AOM_TARGET_CPU}" MATCHES "arm")
+ add_cxx_flag_if_supported("-Wno-psabi")
+ endif()
+
if(ENABLE_WERROR)
add_compiler_flag_if_supported("-Werror")
endif()
@@ -344,7 +351,7 @@ foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT})
list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE)
list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE)
list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
- execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/make/rtcd.pl"
+ execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
--arch=${AOM_TARGET_CPU}
--sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
--config=${AOM_CONFIG_DIR}/config/aom_config.h
diff --git a/third_party/aom/build/cmake/aom_experiment_deps.cmake b/third_party/aom/build/cmake/aom_experiment_deps.cmake
index e2c8102aa..0688704e5 100644
--- a/third_party/aom/build/cmake/aom_experiment_deps.cmake
+++ b/third_party/aom/build/cmake/aom_experiment_deps.cmake
@@ -25,4 +25,8 @@ macro(fix_experiment_configs)
change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
endif()
+ if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD)
+ change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD)
+ endif()
+
endmacro()
diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake
index ce3dc0340..be32a3212 100644
--- a/third_party/aom/build/cmake/aom_optimization.cmake
+++ b/third_party/aom/build/cmake/aom_optimization.cmake
@@ -193,13 +193,13 @@ function(test_nasm)
endfunction()
# Adds build command for generation of rtcd C source files using
-# build/make/rtcd.pl. $config is the input perl file, $output is the output C
+# build/cmake/rtcd.pl. $config is the input perl file, $output is the output C
# include file, $source is the C source file, and $symbol is used for the symbol
# argument passed to rtcd.pl.
function(add_rtcd_build_step config output source symbol)
add_custom_command(OUTPUT ${output}
COMMAND ${PERL_EXECUTABLE} ARGS
- "${AOM_ROOT}/build/make/rtcd.pl"
+ "${AOM_ROOT}/build/cmake/rtcd.pl"
--arch=${AOM_TARGET_CPU}
--sym=${symbol} ${AOM_RTCD_FLAGS}
--config=${AOM_CONFIG_DIR}/config/aom_config.h
diff --git a/third_party/aom/build/cmake/exports.cmake b/third_party/aom/build/cmake/exports.cmake
index 5abfc9a5f..e0813dc0f 100644
--- a/third_party/aom/build/cmake/exports.cmake
+++ b/third_party/aom/build/cmake/exports.cmake
@@ -34,7 +34,8 @@ function(setup_exports_target)
-DAOM_SYM_FILE="${aom_sym_file}" -DAOM_MSVC=${MSVC}
-DAOM_XCODE=${XCODE} -DCONFIG_NAME=$<CONFIG>
-DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
- -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER} -P
+ -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+ -DENABLE_TESTS=${ENABLE_TESTS} -P
"${AOM_ROOT}/build/cmake/generate_exports.cmake"
SOURCES ${AOM_EXPORTS_SOURCES}
DEPENDS ${AOM_EXPORTS_SOURCES})
@@ -47,10 +48,12 @@ function(setup_exports_target)
set_property(TARGET aom APPEND_STRING
PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
elseif(WIN32)
- message(FATAL_ERROR "Windows DLL builds not supported yet.")
if(NOT MSVC)
set_property(TARGET aom APPEND_STRING
PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
+ else()
+ set_property(TARGET aom APPEND_STRING
+ PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
endif()
# TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
diff --git a/third_party/aom/build/cmake/exports_sources.cmake b/third_party/aom/build/cmake/exports_sources.cmake
index 48790dbaa..576920e36 100644
--- a/third_party/aom/build/cmake/exports_sources.cmake
+++ b/third_party/aom/build/cmake/exports_sources.cmake
@@ -13,14 +13,20 @@ if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
-set(AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com")
+list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com"
+ "${AOM_ROOT}/av1/exports_com")
if(CONFIG_AV1_DECODER)
- set(AOM_EXPORTS_SOURCES ${AOM_EXPORTS_SOURCES} "${AOM_ROOT}/aom/exports_dec"
- "${AOM_ROOT}/av1/exports_dec")
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec"
+ "${AOM_ROOT}/av1/exports_dec")
endif()
if(CONFIG_AV1_ENCODER)
- set(AOM_EXPORTS_SOURCES ${AOM_EXPORTS_SOURCES} "${AOM_ROOT}/aom/exports_enc"
- "${AOM_ROOT}/av1/exports_enc")
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc"
+ "${AOM_ROOT}/av1/exports_enc")
+endif()
+
+if(ENABLE_TESTS)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test"
+ "${AOM_ROOT}/av1/exports_test")
endif()
diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
index 8fbb4737b..b91c036de 100644
--- a/third_party/aom/build/cmake/generate_aom_config_templates.cmake
+++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
@@ -78,23 +78,24 @@ if(NOT EXISTS "${AOM_DEFAULTS}")
endif()
include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
-get_cmake_property(cmake_cache_vars CACHE_VARIABLES)
+list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS})
+list(SORT aom_build_vars)
set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
file(WRITE "${aom_config_h_template}" ${h_file_header_block})
-foreach(cache_var ${cmake_cache_vars})
- if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_")
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
file(APPEND "${aom_config_h_template}"
- "\#define ${cache_var} \${${cache_var}}\n")
+ "\#define ${aom_var} \${${aom_var}}\n")
endif()
endforeach()
-file(APPEND "${aom_config_h_template}" "\#endif /* AOM_CONFIG_H_ */")
+file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_")
set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
file(WRITE "${aom_asm_config_template}" ${asm_file_header_block})
-foreach(cache_var ${cmake_cache_vars})
- if(NOT "${cache_var}" MATCHES "AOM_CONFIG_DIR\|AOM_ROOT\|^CMAKE_\|INLINE")
- file(APPEND "${aom_asm_config_template}"
- "${cache_var} equ \${${cache_var}}\n")
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "INLINE" AND NOT "${aom_var}" STREQUAL
+ "AOM_RTCD_FLAGS")
+ file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n")
endif()
endforeach()
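
To make the template's effect concrete: each AOM_DETECT_VARS/AOM_CONFIG_VARS entry
becomes a numeric #define in the generated config/aom_config.h, which C code then
tests at compile time. A minimal sketch with hypothetical values, not output from
a real configure run:

    /* Hypothetical excerpt of a generated config/aom_config.h: */
    #define CONFIG_AV1_DECODER 1
    #define CONFIG_AV1_ENCODER 0
    #define HAVE_AVX2 1

    /* Consumers gate code on the numeric values at compile time: */
    #if HAVE_AVX2
    /* compile the AVX2 implementation of a function */
    #endif
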
diff --git a/third_party/aom/build/cmake/generate_exports.cmake b/third_party/aom/build/cmake/generate_exports.cmake
index 4dce3a671..7ab5aaef8 100644
--- a/third_party/aom/build/cmake/generate_exports.cmake
+++ b/third_party/aom/build/cmake/generate_exports.cmake
@@ -24,9 +24,7 @@ include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
set(symbol_prefix "_")
elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
- set(symbol_prefix "_")
- file(WRITE "${AOM_SYM_FILE}" "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
- "DATA MULTIPLE NONSHARED\n" "EXPORTS\n")
+ file(WRITE "${AOM_SYM_FILE}" "LIBRARY aom\n" "EXPORTS\n")
else()
set(symbol_suffix ";")
endif()
@@ -35,11 +33,10 @@ set(aom_sym_file "${AOM_SYM_FILE}")
if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
file(REMOVE "${aom_sym_file}")
-elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
- file(WRITE "${aom_sym_file}" "LIBRARY libaom INITINSTANCE TERMINSTANCE\n"
- "DATA MULTIPLE NONSHARED\n" "EXPORTS\n")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+ file(WRITE "${aom_sym_file}" "LIBRARY aom\n" "EXPORTS\n")
else()
- file(WRITE "${aom_sym_file}" "{ global:\n")
+ file(WRITE "${aom_sym_file}" "{\nglobal:\n")
endif()
foreach(export_file ${AOM_EXPORTS_SOURCES})
@@ -50,11 +47,20 @@ endforeach()
foreach(exported_symbol ${exported_symbols})
string(STRIP "${exported_symbol}" exported_symbol)
+ if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+ string(SUBSTRING ${exported_symbol} 0 4 export_type)
+ string(COMPARE EQUAL "${export_type}" "data" is_data)
+ if(is_data)
+ set(symbol_suffix " DATA")
+ else()
+ set(symbol_suffix "")
+ endif()
+ endif()
string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}")
- set(exported_symbol "${symbol_prefix}${exported_symbol}${symbol_suffix}")
+ set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}")
file(APPEND "${aom_sym_file}" "${exported_symbol}\n")
endforeach()
if("${aom_sym_file}" MATCHES "ver$")
- file(APPEND "${aom_sym_file}" " };")
+ file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};")
endif()
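
For reference, given the av1/exports_dec entries added earlier in this patch, the
two output shapes would look roughly like this (illustrative, reconstructed from
the logic above rather than copied from a build). First the MSVC
module-definition file, where "data" symbols get the DATA suffix:

    ; generated .def file
    LIBRARY aom
    EXPORTS
      aom_codec_av1_dx_algo DATA
      aom_codec_av1_dx
      av1_add_film_grain

and the linker version script written on ELF targets:

    # generated .ver file
    {
    global:
      aom_codec_av1_dx_algo;
      aom_codec_av1_dx;
      av1_add_film_grain;
    local:
     *;
    };
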
diff --git a/third_party/aom/build/make/ios-Info.plist b/third_party/aom/build/cmake/ios-Info.plist
index 300e3e310..300e3e310 100644
--- a/third_party/aom/build/make/ios-Info.plist
+++ b/third_party/aom/build/cmake/ios-Info.plist
diff --git a/third_party/aom/build/make/iosbuild.sh b/third_party/aom/build/cmake/iosbuild.sh
index 167ece200..167ece200 100755
--- a/third_party/aom/build/make/iosbuild.sh
+++ b/third_party/aom/build/cmake/iosbuild.sh
diff --git a/third_party/aom/build/make/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl
index b849a1eba..46e06907c 100755
--- a/third_party/aom/build/make/rtcd.pl
+++ b/third_party/aom/build/cmake/rtcd.pl
@@ -304,7 +304,7 @@ sub arm() {
#include "aom_ports/arm.h"
static void setup_rtcd_internal(void)
{
- int flags = arm_cpu_caps();
+ int flags = aom_arm_cpu_caps();
(void)flags;
@@ -409,12 +409,13 @@ EOF
#
&require("c");
+&require(keys %required);
if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
x86;
} elsif ($opts{arch} eq 'x86_64') {
@ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
- @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
+ @REQUIRES = filter(qw/mmx sse sse2/);
&require(@REQUIRES);
x86;
} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
@@ -430,6 +431,7 @@ if ($opts{arch} eq 'x86') {
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
@ALL_ARCHS = filter(qw/neon/);
+ &require("neon");
arm;
} elsif ($opts{arch} eq 'ppc') {
@ALL_ARCHS = filter(qw/vsx/);
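
For orientation, the arm branch of the generated runtime dispatch has roughly the
following shape (a sketch: some_function is a hypothetical RTCD-dispatched
symbol, and HAS_NEON is the capability bit from aom_ports/arm.h; only the
setup_rtcd_internal() scaffold and the aom_arm_cpu_caps() rename are taken
verbatim from this patch):

    #include "aom_ports/arm.h"

    static void setup_rtcd_internal(void) {
      int flags = aom_arm_cpu_caps();
      (void)flags;
      /* Generated headers emit one assignment per RTCD function;
       * 'some_function' stands in for a real dispatched symbol. */
      some_function = some_function_c;
      if (flags & HAS_NEON) some_function = some_function_neon;
    }
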
diff --git a/third_party/aom/build/cmake/util.cmake b/third_party/aom/build/cmake/util.cmake
index a0c705691..b70ec4013 100644
--- a/third_party/aom/build/cmake/util.cmake
+++ b/third_party/aom/build/cmake/util.cmake
@@ -13,19 +13,21 @@ if(AOM_BUILD_CMAKE_UTIL_CMAKE_)
endif() # AOM_BUILD_CMAKE_UTIL_CMAKE_
set(AOM_BUILD_CMAKE_UTIL_CMAKE_ 1)
+# Directory where generated sources will be written.
+set(AOM_GEN_SRC_DIR "${AOM_CONFIG_DIR}/gen_src")
+
# Creates dummy source file in $AOM_CONFIG_DIR named $basename.$extension and
# returns the full path to the dummy source file via the $out_file_path
# parameter.
-function(create_dummy_source_file basename extension out_file_path)
- set(dummy_source_file "${AOM_CONFIG_DIR}/${basename}_dummy.${extension}")
+macro(create_dummy_source_file basename extension out_file_path)
+ set(dummy_source_file "${AOM_GEN_SRC_DIR}/${basename}_dummy.${extension}")
file(
WRITE
"${dummy_source_file}" "// Generated file. DO NOT EDIT!\n"
"// ${target_name} needs a ${extension} file to force link language, \n"
"// or to silence a harmless CMake warning: Ignore me.\n"
"void ${target_name}_dummy_function(void) {}\n")
- set(${out_file_path} ${dummy_source_file} PARENT_SCOPE)
-endfunction()
+endmacro()
# Convenience function for adding a dummy source file to $target_name using
# $extension as the file extension. Wraps create_dummy_source_file().
@@ -86,3 +88,84 @@ function(set_compiler_launcher launcher_flag launcher_name)
endif()
endfunction()
+# Sentinel value used to detect when a variable has been set via the -D argument
+# passed to CMake on the command line.
+set(cmake_cmdline_helpstring "No help, variable specified on the command line.")
+
+# Wrapper macro for set() that does some bookkeeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_DETECT_VARS to facilitate build logging and diagnostics.
+macro(set_aom_detect_var name value type helpstring)
+ unset(list_index)
+ list(FIND AOM_DETECT_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_DETECT_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE ${type} "${helpstring}")
+ mark_as_advanced(${name})
+ else()
+ message(
+ WARNING
+ "${name} has been set by CMake, but it may be overridden by the build "
+ "system during environment detection")
+ endif()
+endmacro()
+
+# Wrapper macro for set() that does some bookkeeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_CONFIG_VARS to facilitate build logging and diagnostics.
+macro(set_aom_config_var name value type helpstring)
+ unset(list_index)
+ list(FIND AOM_CONFIG_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_CONFIG_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE ${type} "${helpstring}")
+ endif()
+endmacro()
+
+# Wrapper macro for option() that does some bookkeeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_OPTION_VARS to facilitate build logging and diagnostics.
+macro(set_aom_option_var name helpstring value)
+ unset(list_index)
+ list(FIND AOM_OPTION_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_OPTION_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ option(${name} "${helpstring}" ${value})
+ endif()
+endmacro()
diff --git a/third_party/aom/build/cmake/version.pl b/third_party/aom/build/cmake/version.pl
index 7c0608aeb..7d23f2b27 100755
--- a/third_party/aom/build/cmake/version.pl
+++ b/third_party/aom/build/cmake/version.pl
@@ -14,7 +14,7 @@ use warnings;
use 5.010;
use Getopt::Long;
-my $git_desc;
+my $git_desc = '';
my $version_data;
my $version_filename;
GetOptions('version_data=s' => \$version_data,
@@ -68,7 +68,7 @@ if (length($git_desc) > 0) {
open(my $version_file, '>', $version_filename) or
die("Cannot open $version_filename: $!");
-my $version_packed = "((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))";
+my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))";
my $year = (localtime)[5] + 1900;
my $lic_block = << "EOF";
/*
@@ -91,7 +91,8 @@ $lic_block
#define VERSION_MINOR $version_minor
#define VERSION_PATCH $version_patch
#define VERSION_EXTRA \"$version_extra\"
-#define VERSION_PACKED $version_packed
+#define VERSION_PACKED \\
+ $version_packed
#define VERSION_STRING_NOSP \"$git_desc\"
#define VERSION_STRING \" $git_desc\"
EOF
@@ -102,7 +103,8 @@ $lic_block
#define VERSION_MINOR $version_minor
#define VERSION_PATCH $version_patch
#define VERSION_EXTRA \"$version_extra\"
-#define VERSION_PACKED $version_packed
+#define VERSION_PACKED \\
+ $version_packed
#define VERSION_STRING_NOSP \"v$version_string\"
#define VERSION_STRING \" v$version_string\"
EOF
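
The reflowed VERSION_PACKED definition is semantically unchanged; as a quick
check of the packing (with hypothetical version numbers, not aom's):

    #include <stdio.h>

    #define VERSION_MAJOR 1
    #define VERSION_MINOR 2
    #define VERSION_PATCH 3
    #define VERSION_PACKED \
      ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

    int main(void) {
      printf("0x%06x\n", VERSION_PACKED); /* prints 0x010203 for v1.2.3 */
      return 0;
    }
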
diff --git a/third_party/aom/build/make/thumb.pm b/third_party/aom/build/make/thumb.pm
deleted file mode 100644
index 0a6629d78..000000000
--- a/third_party/aom/build/make/thumb.pm
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env perl
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-package thumb;
-
-sub FixThumbInstructions($$)
-{
- my $short_branches = $_[1];
- my $branch_shift_offset = $short_branches ? 1 : 0;
-
- # Write additions with shifts, such as "add r10, r11, lsl #8",
- # in three operand form, "add r10, r10, r11, lsl #8".
- s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g;
-
- # Convert additions with a non-constant shift into a sequence
- # with left shift, addition and a right shift (to restore the
- # register to the original value). Currently the right shift
- # isn't necessary in the code base since the values in these
- # registers aren't used, but doing the shift for consistency.
- # This converts instructions such as "add r12, r12, r5, lsl r4"
- # into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
- s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;
-
- # Convert loads with right shifts in the indexing into a
- # sequence of an add, load and sub. This converts
- # "ldrb r4, [r9, lr, asr #1]" into "add r9, r9, lr, asr #1",
- # "ldrb r9, [r9]", "sub r9, r9, lr, asr #1".
- s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+),\s*(asr #\d+)\]/$1add $3$5, $5, $6, $7\n$1$2$3$4, [$5]\n$1sub $3$5, $5, $6, $7/g;
-
- # Convert register indexing with writeback into a separate add
- # instruction. This converts "ldrb r12, [r1, r2]!" into
- # "ldrb r12, [r1, r2]", "add r1, r1, r2".
- s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+)\]!/$1$2$3$4, [$5, $6]\n$1add $3$5, $6/g;
-
- # Convert negative register indexing into separate sub/add instructions.
- # This converts "ldrne r4, [src, -pstep, lsl #1]" into
- # "subne src, src, pstep, lsl #1", "ldrne r4, [src]",
- # "addne src, src, pstep, lsl #1". In a couple of cases where
- # this is used, it's used for two subsequent load instructions,
- # where a hand-written version of it could merge two subsequent
- # add and sub instructions.
- s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g;
-
- # Convert register post indexing to a separate add instruction.
- # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
- # "addne r0, r0, r2".
- s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
-
- # Convert "mov pc, lr" into "bx lr", since the former only works
- # for switching from arm to thumb (and only in armv7), but not
- # from thumb to arm.
- s/mov(\s*)pc\s*,\s*lr/bx$1lr/g;
-}
-
-1;
diff --git a/third_party/aom/common/args.h b/third_party/aom/common/args.h
index d12973666..6a2664269 100644
--- a/third_party/aom/common/args.h
+++ b/third_party/aom/common/args.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef ARGS_H_
-#define ARGS_H_
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
#include <stdio.h>
#ifdef __cplusplus
@@ -65,4 +65,4 @@ int arg_parse_list(const struct arg *arg, int *list, int n);
} // extern "C"
#endif
-#endif // ARGS_H_
+#endif // AOM_COMMON_ARGS_H_
diff --git a/third_party/aom/common/av1_config.c b/third_party/aom/common/av1_config.c
new file mode 100644
index 000000000..e8decf76f
--- /dev/null
+++ b/third_party/aom/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_bit(reader); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_literal(reader, (length)); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Could not read bits for " #field \
+ ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+// Helper macros for setting/restoring the error handler data in
+// aom_read_bit_buffer.
+#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \
+ void *original_error_handler_data = NULL; \
+ do { \
+ original_error_handler_data = reader->error_handler_data; \
+ reader->error_handler_data = &new_data; \
+ } while (0)
+
+#define AV1C_POP_ERROR_HANDLER_DATA() \
+ do { \
+ reader->error_handler_data = original_error_handler_data; \
+ } while (0)
+
+static const size_t kAv1cSize = 4;
+
+static void bitreader_error_handler(void *data) {
+ int *error_val = (int *)data;
+ *error_val = -1;
+}
+
+// Parse the AV1 timing_info() structure:
+// timing_info( ) {
+// num_units_in_display_tick f(32)
+// time_scale f(32)
+// equal_picture_interval f(1)
+// if (equal_picture_interval)
+// num_ticks_per_picture_minus_1 uvlc()
+// }
+static int parse_timing_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval);
+ if (equal_picture_interval) {
+ uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader);
+ if (result == -1) {
+ fprintf(stderr,
+ "av1c: Could not read bits for "
+ "num_ticks_per_picture_minus_1, value=%u.\n",
+ num_ticks_per_picture_minus_1);
+ return result;
+ }
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 decoder_model_info() structure:
+// decoder_model_info( ) {
+// buffer_delay_length_minus_1 f(5)
+// num_units_in_decoding_tick f(32)
+// buffer_removal_time_length_minus_1 f(5)
+// frame_presentation_time_length_minus_1 f(5)
+// }
+//
+// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1.
+static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return buffer_delay_length_minus_1 + 1;
+}
+
+// Parse the AV1 operating_parameters_info() structure:
+// operating_parameters_info( op ) {
+// n = buffer_delay_length_minus_1 + 1
+// decoder_buffer_delay[ op ] f(n)
+// encoder_buffer_delay[ op ] f(n)
+// low_delay_mode_flag[ op ] f(1)
+// }
+static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader,
+ int buffer_delay_length_minus_1) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ const int buffer_delay_length = buffer_delay_length_minus_1 + 1;
+ AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 color_config() structure. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44
+static int parse_color_config(struct aom_read_bit_buffer *reader,
+ Av1Config *config) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ int bit_depth = 0;
+ if (config->seq_profile == 2 && config->high_bitdepth) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+ bit_depth = config->twelve_bit ? 12 : 10;
+ } else {
+ bit_depth = config->high_bitdepth ? 10 : 8;
+ }
+
+ if (config->seq_profile != 1) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome);
+ config->monochrome = mono_chrome;
+ }
+
+ int color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag);
+ if (color_description_present_flag) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8);
+ color_primaries = color_primaries_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8);
+ transfer_characteristics = transfer_characteristics_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8);
+ matrix_coefficients = matrix_coefficients_val;
+ }
+
+ if (config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (color_primaries == AOM_CICP_CP_BT_709 &&
+ transfer_characteristics == AOM_CICP_TC_SRGB &&
+ matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ if (config->seq_profile == 0) {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (config->seq_profile == 1) {
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ if (bit_depth == 12) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x);
+ config->chroma_subsampling_x = subsampling_x;
+ if (subsampling_x) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y);
+ config->chroma_subsampling_y = subsampling_y;
+ } else {
+ config->chroma_subsampling_y = 0;
+ }
+ } else {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 0;
+ }
+ }
+
+ if (config->chroma_subsampling_x && config->chroma_subsampling_y) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+ }
+ }
+
+ if (!config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q);
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse AV1 Sequence Header OBU. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41
+static int parse_sequence_header(const uint8_t *const buffer, size_t length,
+ Av1Config *config) {
+ int result = 0;
+ // The reader instance is local to this function, but a pointer to the
+ // reader instance is used within this function and throughout this file to
+ // allow use of the helper macros that reduce parse error checking verbosity.
+ struct aom_read_bit_buffer reader_instance = {
+ buffer, buffer + length, 0, &result, bitreader_error_handler
+ };
+ struct aom_read_bit_buffer *reader = &reader_instance;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+ config->seq_profile = seq_profile;
+ AV1C_READ_BIT_OR_RETURN_ERROR(still_picture);
+ AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header);
+ if (reduced_still_picture_header) {
+ config->initial_presentation_delay_present = 0;
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+ config->seq_level_idx_0 = seq_level_idx_0;
+ config->seq_tier_0 = 0;
+ } else {
+ int has_decoder_model = 0;
+ int buffer_delay_length = 0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag);
+ if (timing_info_present_flag) {
+ if (parse_timing_info(reader) != 0) return -1;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag);
+ if (decoder_model_info_present_flag &&
+ (buffer_delay_length = parse_decoder_model_info(reader)) == -1) {
+ return -1;
+ }
+ has_decoder_model = 1;
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+ config->initial_presentation_delay_present =
+ initial_presentation_delay_present;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5);
+ const int num_operating_points = operating_points_cnt_minus_1 + 1;
+
+ for (int op_index = 0; op_index < num_operating_points; ++op_index) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12);
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5);
+
+ int seq_tier = 0;
+ if (seq_level_idx > 7) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op);
+ seq_tier = seq_tier_this_op;
+ }
+
+ if (has_decoder_model) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op);
+ if (decoder_model_present_for_op) {
+ if (parse_operating_parameters_info(reader, buffer_delay_length) ==
+ -1) {
+ return -1;
+ }
+ }
+ }
+
+ if (config->initial_presentation_delay_present) {
+ // Skip the initial presentation delay bits if present since this
+ // function has no access to the data required to properly set the
+ // field.
+ AV1C_READ_BIT_OR_RETURN_ERROR(
+ initial_presentation_delay_present_for_this_op);
+ if (initial_presentation_delay_present_for_this_op) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4);
+ }
+ }
+
+ if (op_index == 0) {
+ // Av1Config needs only the values from the first operating point.
+ config->seq_level_idx_0 = seq_level_idx;
+ config->seq_tier_0 = seq_tier;
+ config->initial_presentation_delay_present = 0;
+ config->initial_presentation_delay_minus_one = 0;
+ }
+ }
+ }
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1,
+ frame_width_bits_minus_1 + 1);
+ AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1,
+ frame_height_bits_minus_1 + 1);
+
+ int frame_id_numbers_present = 0;
+ if (!reduced_still_picture_header) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag);
+ frame_id_numbers_present = frame_id_numbers_present_flag;
+ }
+
+ if (frame_id_numbers_present) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3);
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter);
+
+ if (!reduced_still_picture_header) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint);
+ if (enable_order_hint) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_jnt_comp);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs);
+ }
+
+ const int SELECT_SCREEN_CONTENT_TOOLS = 2;
+ int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS;
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools);
+ if (!seq_choose_screen_content_tools) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val);
+ seq_force_screen_content_tools = seq_force_screen_content_tools_val;
+ }
+
+ if (seq_force_screen_content_tools > 0) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv);
+
+ if (!seq_choose_integer_mv) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv);
+ }
+ }
+
+ if (enable_order_hint) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3);
+ }
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration);
+
+ if (parse_color_config(reader, config) != 0) {
+ fprintf(stderr, "av1c: color_config() parse failed.\n");
+ return -1;
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present);
+ return 0;
+}
+
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+ Av1Config *config) {
+ if (!buffer || length == 0 || !config) {
+ return -1;
+ }
+
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+
+ size_t sequence_header_length = 0;
+ size_t obu_header_length = 0;
+ if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header,
+ &sequence_header_length,
+ &obu_header_length) != AOM_CODEC_OK ||
+ obu_header.type != OBU_SEQUENCE_HEADER ||
+ sequence_header_length + obu_header_length > length) {
+ return -1;
+ }
+
+ memset(config, 0, sizeof(*config));
+ config->marker = 1;
+ config->version = 1;
+ return parse_sequence_header(buffer + obu_header_length,
+ sequence_header_length, config);
+}
+
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config) {
+ if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1;
+
+ *bytes_read = 0;
+
+ int result = 0;
+ struct aom_read_bit_buffer reader_instance = {
+ buffer, buffer + buffer_length, 0, &result, bitreader_error_handler
+ };
+ struct aom_read_bit_buffer *reader = &reader_instance;
+
+ memset(config, 0, sizeof(*config));
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(marker);
+ config->marker = marker;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(version, 7);
+ config->version = version;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+ config->seq_profile = seq_profile;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+ config->seq_level_idx_0 = seq_level_idx_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0);
+ config->seq_tier_0 = seq_tier_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(monochrome);
+ config->monochrome = monochrome;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x);
+ config->chroma_subsampling_x = chroma_subsampling_x;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y);
+ config->chroma_subsampling_y = chroma_subsampling_y;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+ config->initial_presentation_delay_present =
+ initial_presentation_delay_present;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4);
+ config->initial_presentation_delay_minus_one =
+ initial_presentation_delay_minus_one;
+
+ *bytes_read = aom_rb_bytes_read(reader);
+
+ return 0;
+}
+
+int write_av1config(const Av1Config *config, size_t capacity,
+ size_t *bytes_written, uint8_t *buffer) {
+ if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1;
+
+ *bytes_written = 0;
+ memset(buffer, 0, kAv1cSize);
+
+ struct aom_write_bit_buffer writer = { buffer, 0 };
+
+ aom_wb_write_bit(&writer, config->marker);
+ aom_wb_write_literal(&writer, config->version, 7);
+ aom_wb_write_literal(&writer, config->seq_profile, 3);
+ aom_wb_write_literal(&writer, config->seq_level_idx_0, 5);
+ aom_wb_write_bit(&writer, config->seq_tier_0);
+ aom_wb_write_bit(&writer, config->high_bitdepth);
+ aom_wb_write_bit(&writer, config->twelve_bit);
+ aom_wb_write_bit(&writer, config->monochrome);
+ aom_wb_write_bit(&writer, config->chroma_subsampling_x);
+ aom_wb_write_bit(&writer, config->chroma_subsampling_y);
+ aom_wb_write_literal(&writer, config->chroma_sample_position, 2);
+ aom_wb_write_literal(&writer, 0, 3); // reserved
+ aom_wb_write_bit(&writer, config->initial_presentation_delay_present);
+
+ if (config->initial_presentation_delay_present) {
+ aom_wb_write_literal(&writer, config->initial_presentation_delay_minus_one,
+ 4);
+ } else {
+ aom_wb_write_literal(&writer, 0, 4); // reserved
+ }
+
+ *bytes_written = aom_wb_bytes_written(&writer);
+ return 0;
+}
+
+#undef AV1C_READ_BIT_OR_RETURN_ERROR
+#undef AV1C_READ_BITS_OR_RETURN_ERROR
+#undef AV1C_PUSH_ERROR_HANDLER_DATA
+#undef AV1C_POP_ERROR_HANDLER_DATA
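
The AV1C_* macros above rely on the error-handler hook in aom_read_bit_buffer: a
local 'result' is wired into the reader, and any overrun flips it to -1 via the
callback. A minimal standalone sketch of the same pattern (the read_two_bits
helper is invented for illustration, not part of the patch; the struct
initializer order matches parse_sequence_header above):

    #include <stddef.h>
    #include "aom/aom_integer.h"
    #include "aom_dsp/bitreader_buffer.h"

    static void on_bitreader_error(void *data) { *(int *)data = -1; }

    static int read_two_bits(const uint8_t *buf, size_t len) {
      int result = 0;
      struct aom_read_bit_buffer rb = { buf, buf + len, 0, &result,
                                        on_bitreader_error };
      const int b1 = aom_rb_read_bit(&rb);
      const int b0 = aom_rb_read_bit(&rb);
      if (result == -1) return -1; /* a read ran past the end of 'buf' */
      return (b1 << 1) | b0;
    }
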
diff --git a/third_party/aom/common/av1_config.h b/third_party/aom/common/av1_config.h
new file mode 100644
index 000000000..a15bedb30
--- /dev/null
+++ b/third_party/aom/common/av1_config.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_AV1_CONFIG_H_
+#define AOM_COMMON_AV1_CONFIG_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Struct representing ISOBMFF/Matroska AV1 config. See:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax
+//
+// The AV1 config has the following format:
+//
+// unsigned int (1) marker = 1;
+// unsigned int (7) version = 1;
+// unsigned int (3) seq_profile;
+// unsigned int (5) seq_level_idx_0;
+// unsigned int (1) seq_tier_0;
+// unsigned int (1) high_bitdepth;
+// unsigned int (1) twelve_bit;
+// unsigned int (1) monochrome;
+// unsigned int (1) chroma_subsampling_x;
+// unsigned int (1) chroma_subsampling_y;
+// unsigned int (2) chroma_sample_position;
+// unsigned int (3) reserved = 0;
+//
+// unsigned int (1) initial_presentation_delay_present;
+// if (initial_presentation_delay_present) {
+// unsigned int (4) initial_presentation_delay_minus_one;
+// } else {
+// unsigned int (4) reserved = 0;
+// }
+//
+// unsigned int (8)[] configOBUs;
+//
+// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so
+// the field is omitted.
+typedef struct _Av1Config {
+ uint8_t marker;
+ uint8_t version;
+ uint8_t seq_profile;
+ uint8_t seq_level_idx_0;
+ uint8_t seq_tier_0;
+ uint8_t high_bitdepth;
+ uint8_t twelve_bit;
+ uint8_t monochrome;
+ uint8_t chroma_subsampling_x;
+ uint8_t chroma_subsampling_y;
+ uint8_t chroma_sample_position;
+ uint8_t initial_presentation_delay_present;
+ uint8_t initial_presentation_delay_minus_one;
+} Av1Config;
+
+// Attempts to parse a Sequence Header OBU and set the parameters of 'config'.
+// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple
+// OBUs, but the Sequence Header OBU must be the first OBU within the buffer.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+ Av1Config *config);
+
+// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success.
+// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or
+// when parsing of 'buffer' fails.
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config);
+
+// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'.
+// Returns -1 when passed NULL pointers or when 'capacity' is insufficient.
+int write_av1config(const Av1Config *config, size_t capacity,
+ size_t *bytes_written, uint8_t *buffer);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_AV1_CONFIG_H_
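
A short usage sketch for the new API (the 4-byte payload is hypothetical:
marker=1, version=1, seq_profile=0, seq_level_idx_0=8, tier 0, 8-bit 4:2:0,
which packs to 0x81 0x08 0x0c 0x00 per the bit layout documented above):

    #include <stdio.h>
    #include "common/av1_config.h"

    static int av1c_roundtrip(void) {
      const uint8_t av1c[4] = { 0x81, 0x08, 0x0c, 0x00 };
      Av1Config config;
      size_t bytes_read = 0;
      if (read_av1config(av1c, sizeof(av1c), &bytes_read, &config) != 0)
        return -1;
      printf("profile=%d level=%d tier=%d\n", config.seq_profile,
             config.seq_level_idx_0, config.seq_tier_0);
      /* Re-serialize; write_av1config zero-fills and rewrites all 4 bytes. */
      uint8_t out[4];
      size_t bytes_written = 0;
      return write_av1config(&config, sizeof(out), &bytes_written, out);
    }
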
diff --git a/third_party/aom/common/ivfdec.h b/third_party/aom/common/ivfdec.h
index 9013dea64..ea294faa1 100644
--- a/third_party/aom/common/ivfdec.h
+++ b/third_party/aom/common/ivfdec.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef IVFDEC_H_
-#define IVFDEC_H_
+#ifndef AOM_COMMON_IVFDEC_H_
+#define AOM_COMMON_IVFDEC_H_
#include "common/tools_common.h"
@@ -27,4 +27,4 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
} /* extern "C" */
#endif
-#endif // IVFDEC_H_
+#endif // AOM_COMMON_IVFDEC_H_
diff --git a/third_party/aom/common/ivfenc.h b/third_party/aom/common/ivfenc.h
index f0cab8178..8f6d947d4 100644
--- a/third_party/aom/common/ivfenc.h
+++ b/third_party/aom/common/ivfenc.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef IVFENC_H_
-#define IVFENC_H_
+#ifndef AOM_COMMON_IVFENC_H_
+#define AOM_COMMON_IVFENC_H_
#include "common/tools_common.h"
@@ -31,4 +31,4 @@ void ivf_write_frame_size(FILE *outfile, size_t frame_size);
} /* extern "C" */
#endif
-#endif // IVFENC_H_
+#endif // AOM_COMMON_IVFENC_H_
diff --git a/third_party/aom/common/md5_utils.h b/third_party/aom/common/md5_utils.h
index bd4991b3a..144fa3ad2 100644
--- a/third_party/aom/common/md5_utils.h
+++ b/third_party/aom/common/md5_utils.h
@@ -20,8 +20,8 @@
* Still in the public domain.
*/
-#ifndef MD5_UTILS_H_
-#define MD5_UTILS_H_
+#ifndef AOM_COMMON_MD5_UTILS_H_
+#define AOM_COMMON_MD5_UTILS_H_
#ifdef __cplusplus
extern "C" {
@@ -46,4 +46,4 @@ void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
} // extern "C"
#endif
-#endif // MD5_UTILS_H_
+#endif // AOM_COMMON_MD5_UTILS_H_
diff --git a/third_party/aom/common/obudec.c b/third_party/aom/common/obudec.c
index cd88f1648..acbd12e0c 100644
--- a/third_party/aom/common/obudec.c
+++ b/third_party/aom/common/obudec.c
@@ -17,7 +17,7 @@
#include "aom_ports/mem_ops.h"
#include "av1/common/common.h"
-#include "av1/decoder/obu.h"
+#include "av1/common/obu_util.h"
#define OBU_BUFFER_SIZE (500 * 1024)
diff --git a/third_party/aom/common/obudec.h b/third_party/aom/common/obudec.h
index c52a94e9d..b2adb1e3d 100644
--- a/third_party/aom/common/obudec.h
+++ b/third_party/aom/common/obudec.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef OBUDEC_H_
-#define OBUDEC_H_
+#ifndef AOM_COMMON_OBUDEC_H_
+#define AOM_COMMON_OBUDEC_H_
#include "common/tools_common.h"
@@ -45,4 +45,4 @@ void obudec_free(struct ObuDecInputContext *obu_ctx);
} /* extern "C" */
#endif
-#endif // OBUDEC_H_
+#endif // AOM_COMMON_OBUDEC_H_
diff --git a/third_party/aom/common/rawenc.c b/third_party/aom/common/rawenc.c
new file mode 100644
index 000000000..5a2731d3a
--- /dev/null
+++ b/third_party/aom/common/rawenc.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/rawenc.h"
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file) {
+ const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ for (int i = 0; i < num_planes; ++i) {
+ const int plane = planes[i];
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane);
+ const int h = aom_img_plane_height(img, plane);
+ for (int y = 0; y < h; ++y) {
+ fwrite(buf, bytes_per_sample, w, file);
+ buf += stride;
+ }
+ }
+}
+
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5) {
+ for (int i = 0; i < num_planes; ++i) {
+ const int plane = planes[i];
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane) *
+ ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ const int h = aom_img_plane_height(img, plane);
+ for (int y = 0; y < h; ++y) {
+ MD5Update(md5, buf, w);
+ buf += stride;
+ }
+ }
+}
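
A usage sketch (an assumed call site, not part of this change): raw_write_image_file() already derives 1 or 2 bytes per sample from the image format, so dumping a decoded frame in Y/U/V order reduces to:

#include <stdio.h>

#include "aom/aom_image.h"
#include "common/rawenc.h"

/* Write the three planes of 'img' to 'out' as packed raw samples; the
   AOM_PLANE_* indices come from aom_image.h. */
static void dump_yuv_frame(const aom_image_t *img, FILE *out) {
  const int planes[3] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
  raw_write_image_file(img, planes, 3, out);
}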
diff --git a/third_party/aom/common/rawenc.h b/third_party/aom/common/rawenc.h
new file mode 100644
index 000000000..cf5e00e6f
--- /dev/null
+++ b/third_party/aom/common/rawenc.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_RAWENC_H_
+#define AOM_COMMON_RAWENC_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file);
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_RAWENC_H_
diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h
index 587903650..4e1d12f4a 100644
--- a/third_party/aom/common/tools_common.h
+++ b/third_party/aom/common/tools_common.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TOOLS_COMMON_H_
-#define TOOLS_COMMON_H_
+#ifndef AOM_COMMON_TOOLS_COMMON_H_
+#define AOM_COMMON_TOOLS_COMMON_H_
#include <stdio.h>
@@ -161,4 +161,4 @@ void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
} /* extern "C" */
#endif
-#endif // TOOLS_COMMON_H_
+#endif // AOM_COMMON_TOOLS_COMMON_H_
diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h
index 965038d39..bf95031be 100644
--- a/third_party/aom/common/video_common.h
+++ b/third_party/aom/common/video_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef VIDEO_COMMON_H_
-#define VIDEO_COMMON_H_
+#ifndef AOM_COMMON_VIDEO_COMMON_H_
+#define AOM_COMMON_VIDEO_COMMON_H_
#include "common/tools_common.h"
@@ -22,4 +22,4 @@ typedef struct {
unsigned int is_annexb;
} AvxVideoInfo;
-#endif // VIDEO_COMMON_H_
+#endif // AOM_COMMON_VIDEO_COMMON_H_
diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c
index b54c250c2..47ad6e189 100644
--- a/third_party/aom/common/video_reader.c
+++ b/third_party/aom/common/video_reader.c
@@ -17,11 +17,13 @@
#include "common/obudec.h"
#include "common/tools_common.h"
#include "common/video_reader.h"
+#include "common/webmdec.h"
struct AvxVideoReaderStruct {
AvxVideoInfo info;
struct AvxInputContext input_ctx;
struct ObuDecInputContext obu_ctx;
+ struct WebmInputContext webm_ctx;
uint8_t *buffer;
size_t buffer_size;
size_t frame_size;
@@ -49,6 +51,13 @@ AvxVideoReader *aom_video_reader_open(const char *filename) {
reader->info.codec_fourcc = reader->input_ctx.fourcc;
reader->info.frame_width = reader->input_ctx.width;
reader->info.frame_height = reader->input_ctx.height;
+#if CONFIG_WEBM_IO
+ } else if (file_is_webm(&reader->webm_ctx, &reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_WEBM;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
+#endif
} else if (file_is_obu(&reader->obu_ctx)) {
reader->input_ctx.file_type = FILE_TYPE_OBU;
// assume AV1
@@ -83,6 +92,11 @@ int aom_video_reader_read_frame(AvxVideoReader *reader) {
return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer,
&reader->frame_size,
&reader->buffer_size);
+#if CONFIG_WEBM_IO
+ } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) {
+ return !webm_read_frame(&reader->webm_ctx, &reader->buffer,
+ &reader->frame_size, &reader->buffer_size);
+#endif
} else {
assert(0);
return 0;
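
With this hunk the reader probes IVF first, then WebM (when CONFIG_WEBM_IO is enabled), then falls back to raw OBU input. A hedged consumption sketch, assuming aom_video_reader_get_frame() and aom_video_reader_close() are the accessors declared alongside in video_reader.h:

#include <stddef.h>
#include <stdint.h>

#include "common/video_reader.h"

static void drain_file(const char *path) {
  AvxVideoReader *reader = aom_video_reader_open(path);
  if (!reader) return;
  while (aom_video_reader_read_frame(reader)) {
    size_t frame_size = 0;
    const uint8_t *frame = aom_video_reader_get_frame(reader, &frame_size);
    (void)frame; /* hand the compressed frame to the decoder here */
  }
  aom_video_reader_close(reader);
}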
diff --git a/third_party/aom/common/video_reader.h b/third_party/aom/common/video_reader.h
index 1eb33831a..903deae84 100644
--- a/third_party/aom/common/video_reader.h
+++ b/third_party/aom/common/video_reader.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef VIDEO_READER_H_
-#define VIDEO_READER_H_
+#ifndef AOM_COMMON_VIDEO_READER_H_
+#define AOM_COMMON_VIDEO_READER_H_
#include "common/video_common.h"
@@ -54,4 +54,4 @@ const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
} // extern "C"
#endif
-#endif // VIDEO_READER_H_
+#endif // AOM_COMMON_VIDEO_READER_H_
diff --git a/third_party/aom/common/video_writer.h b/third_party/aom/common/video_writer.h
index 16655d3a6..3e2b6554b 100644
--- a/third_party/aom/common/video_writer.h
+++ b/third_party/aom/common/video_writer.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef VIDEO_WRITER_H_
-#define VIDEO_WRITER_H_
+#ifndef AOM_COMMON_VIDEO_WRITER_H_
+#define AOM_COMMON_VIDEO_WRITER_H_
#include "common/video_common.h"
@@ -42,4 +42,4 @@ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
} // extern "C"
#endif
-#endif // VIDEO_WRITER_H_
+#endif // AOM_COMMON_VIDEO_WRITER_H_
diff --git a/third_party/aom/common/warnings.h b/third_party/aom/common/warnings.h
index 61db2dcf8..36f1fe070 100644
--- a/third_party/aom/common/warnings.h
+++ b/third_party/aom/common/warnings.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef WARNINGS_H_
-#define WARNINGS_H_
+#ifndef AOM_COMMON_WARNINGS_H_
+#define AOM_COMMON_WARNINGS_H_
#ifdef __cplusplus
extern "C" {
@@ -31,4 +31,4 @@ void check_encoder_config(int disable_prompt,
} // extern "C"
#endif
-#endif // WARNINGS_H_
+#endif // AOM_COMMON_WARNINGS_H_
diff --git a/third_party/aom/common/webmdec.h b/third_party/aom/common/webmdec.h
index d5b472a01..5ac75cb30 100644
--- a/third_party/aom/common/webmdec.h
+++ b/third_party/aom/common/webmdec.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef WEBMDEC_H_
-#define WEBMDEC_H_
+#ifndef AOM_COMMON_WEBMDEC_H_
+#define AOM_COMMON_WEBMDEC_H_
#include "common/tools_common.h"
@@ -68,4 +68,4 @@ void webm_free(struct WebmInputContext *webm_ctx);
} // extern "C"
#endif
-#endif // WEBMDEC_H_
+#endif // AOM_COMMON_WEBMDEC_H_
diff --git a/third_party/aom/common/webmenc.h b/third_party/aom/common/webmenc.h
index 74387fb8d..aa9832fba 100644
--- a/third_party/aom/common/webmenc.h
+++ b/third_party/aom/common/webmenc.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef WEBMENC_H_
-#define WEBMENC_H_
+#ifndef AOM_COMMON_WEBMENC_H_
+#define AOM_COMMON_WEBMENC_H_
#include <stdio.h>
#include <stdlib.h>
@@ -53,4 +53,4 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx);
} // extern "C"
#endif
-#endif // WEBMENC_H_
+#endif // AOM_COMMON_WEBMENC_H_
diff --git a/third_party/aom/common/y4menc.c b/third_party/aom/common/y4menc.c
index d33d0313e..585d22197 100644
--- a/third_party/aom/common/y4menc.c
+++ b/third_party/aom/common/y4menc.c
@@ -11,56 +11,93 @@
#include <assert.h>
+#include "common/rawenc.h"
#include "common/y4menc.h"
-int y4m_write_file_header(char *buf, size_t len, int width, int height,
- const struct AvxRational *framerate,
- aom_img_fmt_t fmt, unsigned int bit_depth) {
- const char *color;
+// Returns the Y4M name associated with the monochrome colorspace.
+const char *monochrome_colorspace(unsigned int bit_depth) {
+ switch (bit_depth) {
+ case 8: return "Cmono";
+ case 9: return "Cmono9";
+ case 10: return "Cmono10";
+ case 12: return "Cmono12";
+ case 16: return "Cmono16";
+ default: assert(0); return NULL;
+ }
+}
+
+// Returns the Y4M name of the 8-bit colorspace, given the chroma position and
+// image format.
+const char *colorspace8(aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) {
+ switch (fmt) {
+ case AOM_IMG_FMT_444A: return "C444alpha";
+ case AOM_IMG_FMT_I444: return "C444";
+ case AOM_IMG_FMT_I422: return "C422";
+ default:
+ if (csp == AOM_CSP_VERTICAL) {
+ return "C420mpeg2 XYSCSS=420MPEG2";
+ } else {
+ return "C420jpeg";
+ }
+ }
+}
+
+// Returns the Y4M name of the colorspace, given the bit depth and image format.
+const char *colorspace(unsigned int bit_depth, aom_chroma_sample_position_t csp,
+ aom_img_fmt_t fmt) {
switch (bit_depth) {
- case 8:
- color = fmt == AOM_IMG_FMT_444A
- ? "C444alpha\n"
- : fmt == AOM_IMG_FMT_I444
- ? "C444\n"
- : fmt == AOM_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
- break;
+ case 8: return colorspace8(csp, fmt);
case 9:
- color = fmt == AOM_IMG_FMT_I44416
- ? "C444p9 XYSCSS=444P9\n"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
- : "C420p9 XYSCSS=420P9\n";
- break;
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p9 XYSCSS=444P9"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
+ : "C420p9 XYSCSS=420P9";
case 10:
- color = fmt == AOM_IMG_FMT_I44416
- ? "C444p10 XYSCSS=444P10\n"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
- : "C420p10 XYSCSS=420P10\n";
- break;
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p10 XYSCSS=444P10"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
+ : "C420p10 XYSCSS=420P10";
case 12:
- color = fmt == AOM_IMG_FMT_I44416
- ? "C444p12 XYSCSS=444P12\n"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
- : "C420p12 XYSCSS=420P12\n";
- break;
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p12 XYSCSS=444P12"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
+ : "C420p12 XYSCSS=420P12";
case 14:
- color = fmt == AOM_IMG_FMT_I44416
- ? "C444p14 XYSCSS=444P14\n"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
- : "C420p14 XYSCSS=420P14\n";
- break;
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p14 XYSCSS=444P14"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
+ : "C420p14 XYSCSS=420P14";
case 16:
- color = fmt == AOM_IMG_FMT_I44416
- ? "C444p16 XYSCSS=444P16\n"
- : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
- : "C420p16 XYSCSS=420P16\n";
- break;
- default: color = NULL; assert(0);
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p16 XYSCSS=444P16"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16"
+ : "C420p16 XYSCSS=420P16";
+ default: assert(0); return NULL;
}
- return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
+}
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+ const struct AvxRational *framerate, int monochrome,
+ aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+ unsigned int bit_depth) {
+ const char *color = monochrome ? monochrome_colorspace(bit_depth)
+ : colorspace(bit_depth, csp, fmt);
+ return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height,
framerate->numerator, framerate->denominator, 'p', color);
}
int y4m_write_frame_header(char *buf, size_t len) {
return snprintf(buf, len, "FRAME\n");
}
+
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+ FILE *file) {
+ int num_planes = img->monochrome ? 1 : 3;
+ raw_write_image_file(img, planes, num_planes, file);
+}
+
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+ MD5Context *md5) {
+ int num_planes = img->monochrome ? 1 : 3;
+ raw_update_image_md5(img, planes, num_planes, md5);
+}
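
Given the format string above, a monochrome 10-bit 640x480 stream at 30 fps produces the header "YUV4MPEG2 W640 H480 F30:1 Ip Cmono10\n". A sketch of the extended call; the AvxRational initializer assumes numerator-first field order:

#include <stdio.h>

#include "common/y4menc.h"

static void write_example_header(FILE *out) {
  char header[Y4M_BUFFER_SIZE];
  const struct AvxRational fps = { 30, 1 }; /* assumed { num, den } order */
  const int len = y4m_write_file_header(header, sizeof(header), 640, 480, &fps,
                                        /*monochrome=*/1, AOM_CSP_UNKNOWN,
                                        AOM_IMG_FMT_I420, /*bit_depth=*/10);
  if (len > 0 && (size_t)len < sizeof(header)) fwrite(header, 1, len, out);
}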
diff --git a/third_party/aom/common/y4menc.h b/third_party/aom/common/y4menc.h
index 6344176ba..f6d5fd86b 100644
--- a/third_party/aom/common/y4menc.h
+++ b/third_party/aom/common/y4menc.h
@@ -9,10 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef Y4MENC_H_
-#define Y4MENC_H_
+#ifndef AOM_COMMON_Y4MENC_H_
+#define AOM_COMMON_Y4MENC_H_
#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
#include "common/tools_common.h"
#ifdef __cplusplus
@@ -22,12 +23,17 @@ extern "C" {
#define Y4M_BUFFER_SIZE 128
int y4m_write_file_header(char *buf, size_t len, int width, int height,
- const struct AvxRational *framerate,
- aom_img_fmt_t fmt, unsigned int bit_depth);
+ const struct AvxRational *framerate, int monochrome,
+ aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+ unsigned int bit_depth);
int y4m_write_frame_header(char *buf, size_t len);
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+ FILE *file);
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+ MD5Context *md5);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // Y4MENC_H_
+#endif // AOM_COMMON_Y4MENC_H_
diff --git a/third_party/aom/common/y4minput.c b/third_party/aom/common/y4minput.c
index a1dca10cd..eca8b1bba 100644
--- a/third_party/aom/common/y4minput.c
+++ b/third_party/aom/common/y4minput.c
@@ -393,7 +393,7 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
}
/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0.
- This is used as a helper by several converation routines.*/
+ This is used as a helper by several conversion routines.*/
static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
const unsigned char *_src, int _c_w,
int _c_h) {
diff --git a/third_party/aom/common/y4minput.h b/third_party/aom/common/y4minput.h
index db20190db..01b9ce972 100644
--- a/third_party/aom/common/y4minput.h
+++ b/third_party/aom/common/y4minput.h
@@ -12,8 +12,8 @@
* Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
*/
-#ifndef Y4MINPUT_H_
-#define Y4MINPUT_H_
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
#include <stdio.h>
#include "aom/aom_image.h"
@@ -66,4 +66,4 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img);
} // extern "C"
#endif
-#endif // Y4MINPUT_H_
+#endif // AOM_COMMON_Y4MINPUT_H_
diff --git a/third_party/aom/docs.cmake b/third_party/aom/docs.cmake
index fd8d02ce6..b5bfa9b56 100644
--- a/third_party/aom/docs.cmake
+++ b/third_party/aom/docs.cmake
@@ -20,10 +20,17 @@ set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
set(AOM_DOXYGEN_SECTIONS "av1")
-set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h"
- "${AOM_ROOT}/aom/aom_frame_buffer.h" "${AOM_ROOT}/aom/aom_image.h"
- "${AOM_ROOT}/aom/aom_integer.h" "${AOM_ROOT}/keywords.dox"
- "${AOM_ROOT}/mainpage.dox" "${AOM_ROOT}/usage.dox")
+set(AOM_DOXYGEN_SOURCES
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/keywords.dox"
+ "${AOM_ROOT}/mainpage.dox"
+ "${AOM_ROOT}/usage.dox")
if(CONFIG_AV1_DECODER)
set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
@@ -37,8 +44,7 @@ if(CONFIG_AV1_DECODER)
set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
- set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
- "${AOM_ROOT}/aom/aom_decoder.h" "${AOM_ROOT}/aom/aomdx.h"
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h"
"${AOM_ROOT}/usage_dx.dox")
if(CONFIG_ANALYZER)
@@ -78,7 +84,7 @@ if(CONFIG_AV1_ENCODER)
set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder")
set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
- "${AOM_ROOT}/aom/aom_encoder.h" "${AOM_ROOT}/usage_cx.dox")
+ "${AOM_ROOT}/usage_cx.dox")
endif()
if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c
index fc037d484..8e3d216fe 100644
--- a/third_party/aom/examples/aom_cx_set_ref.c
+++ b/third_party/aom/examples/aom_cx_set_ref.c
@@ -54,12 +54,11 @@
#include "aom/aom_decoder.h"
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
#include "common/tools_common.h"
#include "common/video_writer.h"
#include "examples/encoder_util.h"
-#define AOM_BORDER_IN_PIXELS 288
-
static const char *exec_name;
void usage_exit() {
diff --git a/third_party/aom/examples/encoder_util.h b/third_party/aom/examples/encoder_util.h
index 966f5e004..a6bb3fb48 100644
--- a/third_party/aom/examples/encoder_util.h
+++ b/third_party/aom/examples/encoder_util.h
@@ -11,8 +11,8 @@
// Utility functions used by encoder binaries.
-#ifndef EXAMPLES_ENCODER_UTIL_H_
-#define EXAMPLES_ENCODER_UTIL_H_
+#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_
+#define AOM_EXAMPLES_ENCODER_UTIL_H_
#include "aom/aom_image.h"
@@ -30,4 +30,4 @@ void aom_find_mismatch(const aom_image_t *const img1,
int aom_compare_img(const aom_image_t *const img1,
const aom_image_t *const img2);
-#endif // EXAMPLES_ENCODER_UTIL_H_
+#endif // AOM_EXAMPLES_ENCODER_UTIL_H_
diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c
index 71d4dec77..159f1617a 100644
--- a/third_party/aom/examples/lightfield_bitstream_parsing.c
+++ b/third_party/aom/examples/lightfield_bitstream_parsing.c
@@ -104,6 +104,7 @@ static int get_image_bps(aom_img_fmt_t fmt) {
case AOM_IMG_FMT_I44416: return 48;
default: die("Invalid image format");
}
+ return 0;
}
int main(int argc, char **argv) {
@@ -212,6 +213,7 @@ int main(int argc, char **argv) {
if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy,
pts))
die_codec(&codec, "Failed to copy compressed camera frame header.");
+ free(frame_hdr_buf);
}
// Read out the image format.
@@ -219,6 +221,7 @@ int main(int argc, char **argv) {
if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
die_codec(&codec, "Failed to get the image format");
const int bps = get_image_bps(ref_fmt);
+ if (!bps) die_codec(&codec, "Invalid image format.");
// read out the tile size.
unsigned int tile_size = 0;
if (aom_codec_control(&codec, AV1D_GET_TILE_SIZE, &tile_size))
diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c
index 5da468413..f5e54db7f 100644
--- a/third_party/aom/examples/lightfield_decoder.c
+++ b/third_party/aom/examples/lightfield_decoder.c
@@ -25,12 +25,11 @@
#include "aom/aom_decoder.h"
#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
#include "common/tools_common.h"
#include "common/video_reader.h"
-#define MAX_EXTERNAL_REFERENCES 128
-#define AOM_BORDER_IN_PIXELS 288
-
static const char *exec_name;
void usage_exit(void) {
diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c
index f8c37fbb0..e55cd5ce3 100644
--- a/third_party/aom/examples/lightfield_encoder.c
+++ b/third_party/aom/examples/lightfield_encoder.c
@@ -37,14 +37,11 @@
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
#include "av1/common/enums.h"
-
#include "common/tools_common.h"
#include "common/video_writer.h"
-#define MAX_EXTERNAL_REFERENCES 128
-#define AOM_BORDER_IN_PIXELS 288
-
static const char *exec_name;
void usage_exit(void) {
diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c
index 2e4f3898d..5556bf0e7 100644
--- a/third_party/aom/examples/lightfield_tile_list_decoder.c
+++ b/third_party/aom/examples/lightfield_tile_list_decoder.c
@@ -30,12 +30,11 @@
#include "aom/aom_decoder.h"
#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
#include "common/tools_common.h"
#include "common/video_reader.h"
-#define MAX_EXTERNAL_REFERENCES 128
-#define AOM_BORDER_IN_PIXELS 288
-
static const char *exec_name;
void usage_exit(void) {
diff --git a/third_party/aom/examples/noise_model.c b/third_party/aom/examples/noise_model.c
index 5a5b4d40d..5cc6003b6 100644
--- a/third_party/aom/examples/noise_model.c
+++ b/third_party/aom/examples/noise_model.c
@@ -179,9 +179,14 @@ static void print_variance_y(FILE *debug_file, aom_image_t *raw,
int block_size, aom_film_grain_t *grain) {
aom_image_t renoised;
grain->apply_grain = 1;
- grain->random_seed = 1071;
+ grain->random_seed = 7391;
aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1);
- av1_add_film_grain(grain, denoised, &renoised);
+
+ if (av1_add_film_grain(grain, denoised, &renoised)) {
+ fprintf(stderr, "Internal failure in av1_add_film_grain().\n");
+ aom_img_free(&renoised);
+ return;
+ }
const int num_blocks_w = (raw->w + block_size - 1) / block_size;
const int num_blocks_h = (raw->h + block_size - 1) / block_size;
@@ -324,7 +329,7 @@ int main(int argc, char *argv[]) {
const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
// Sets the random seed on the first entry in the output table
- int16_t random_seed = 1071;
+ int16_t random_seed = 7391;
aom_noise_model_t noise_model;
aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth,
high_bd };
diff --git a/third_party/aom/stats/aomstats.h b/third_party/aom/stats/aomstats.h
index 643809344..b9c71871a 100644
--- a/third_party/aom/stats/aomstats.h
+++ b/third_party/aom/stats/aomstats.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOMSTATS_H_
-#define AOMSTATS_H_
+#ifndef AOM_STATS_AOMSTATS_H_
+#define AOM_STATS_AOMSTATS_H_
#include <stdio.h>
@@ -41,4 +41,4 @@ aom_fixed_buf_t stats_get(stats_io_t *stats);
} // extern "C"
#endif
-#endif // AOMSTATS_H_
+#endif // AOM_STATS_AOMSTATS_H_
diff --git a/third_party/aom/stats/rate_hist.h b/third_party/aom/stats/rate_hist.h
index e6aa149ae..55b8c5d43 100644
--- a/third_party/aom/stats/rate_hist.h
+++ b/third_party/aom/stats/rate_hist.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef RATE_HIST_H_
-#define RATE_HIST_H_
+#ifndef AOM_STATS_RATE_HIST_H_
+#define AOM_STATS_RATE_HIST_H_
#include "aom/aom_encoder.h"
@@ -38,4 +38,4 @@ void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
} // extern "C"
#endif
-#endif // RATE_HIST_H_
+#endif // AOM_STATS_RATE_HIST_H_
diff --git a/third_party/aom/test/acm_random.h b/third_party/aom/test/acm_random.h
index 023387061..0a8317fd5 100644
--- a/third_party/aom/test/acm_random.h
+++ b/third_party/aom/test/acm_random.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_ACM_RANDOM_H_
-#define TEST_ACM_RANDOM_H_
+#ifndef AOM_TEST_ACM_RANDOM_H_
+#define AOM_TEST_ACM_RANDOM_H_
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -81,4 +81,4 @@ class ACMRandom {
} // namespace libaom_test
-#endif // TEST_ACM_RANDOM_H_
+#endif // AOM_TEST_ACM_RANDOM_H_
diff --git a/third_party/aom/test/av1_config_test.cc b/third_party/aom/test/av1_config_test.cc
new file mode 100644
index 000000000..e2f2c5390
--- /dev/null
+++ b/third_party/aom/test/av1_config_test.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <string.h>
+
+#include "common/av1_config.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+//
+// Input buffers containing exactly one Sequence Header OBU.
+//
+// Each buffer is named according to the OBU storage format (Annex-B vs Low
+// Overhead Bitstream Format) and the type of Sequence Header OBU ("Full"
+// Sequence Header OBUs vs Sequence Header OBUs with the
+// reduced_still_image_flag set).
+//
+const uint8_t kAnnexBFullSequenceHeaderObu[] = {
+ 0x0c, 0x08, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
+};
+const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = {
+ 0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
+};
+
+const uint8_t kLobfFullSequenceHeaderObu[] = {
+ 0x0a, 0x0b, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
+};
+
+const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = {
+ 0x0a, 0x07, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
+};
+
+const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 };
+
+// The size of AV1 config when no configOBUs are present at the end of the
+// configuration structure.
+const size_t kAv1cNoConfigObusSize = 4;
+
+bool VerifyAv1c(const uint8_t *const obu_buffer, size_t obu_buffer_length,
+ bool is_annexb) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ bool parse_ok = get_av1config_from_obu(obu_buffer, obu_buffer_length,
+ is_annexb, &av1_config) == 0;
+ if (parse_ok) {
+ EXPECT_EQ(1, av1_config.marker);
+ EXPECT_EQ(1, av1_config.version);
+ EXPECT_EQ(0, av1_config.seq_profile);
+ EXPECT_EQ(0, av1_config.seq_level_idx_0);
+ EXPECT_EQ(0, av1_config.seq_tier_0);
+ EXPECT_EQ(0, av1_config.high_bitdepth);
+ EXPECT_EQ(0, av1_config.twelve_bit);
+ EXPECT_EQ(0, av1_config.monochrome);
+ EXPECT_EQ(1, av1_config.chroma_subsampling_x);
+ EXPECT_EQ(1, av1_config.chroma_subsampling_y);
+ EXPECT_EQ(0, av1_config.chroma_sample_position);
+ EXPECT_EQ(0, av1_config.initial_presentation_delay_present);
+ EXPECT_EQ(0, av1_config.initial_presentation_delay_minus_one);
+ }
+ return parse_ok && ::testing::Test::HasFailure() == false;
+}
+
+TEST(Av1Config, ObuInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ ASSERT_EQ(-1, get_av1config_from_obu(NULL, 0, 0, NULL));
+ ASSERT_EQ(-1,
+ get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, NULL));
+ ASSERT_EQ(
+ -1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0],
+ sizeof(kLobfFullSequenceHeaderObu), 0, NULL));
+ ASSERT_EQ(-1, get_av1config_from_obu(NULL, sizeof(kLobfFullSequenceHeaderObu),
+ 0, NULL));
+ ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0,
+ &av1_config));
+}
+
+TEST(Av1Config, ReadInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_read = 0;
+ ASSERT_EQ(-1, read_av1config(NULL, 0, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(NULL, 4, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 0, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 4, &bytes_read, NULL));
+ ASSERT_EQ(-1, read_av1config(NULL, 4, &bytes_read, &av1_config));
+}
+
+TEST(Av1Config, WriteInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(-1, write_av1config(NULL, 0, NULL, NULL));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, NULL, NULL));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, &bytes_written, NULL));
+
+ ASSERT_EQ(-1,
+ write_av1config(&av1_config, 0, &bytes_written, &av1c_buffer[0]));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 4, &bytes_written, NULL));
+}
+
+TEST(Av1Config, GetAv1ConfigFromLobfObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+  // unset, i.e., a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfFullSequenceHeaderObu,
+ sizeof(kLobfFullSequenceHeaderObu), false));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfReducedStillImageSequenceHeaderObu,
+ sizeof(kLobfReducedStillImageSequenceHeaderObu),
+ false));
+}
+
+TEST(Av1Config, GetAv1ConfigFromAnnexBObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+  // unset, i.e., a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBFullSequenceHeaderObu,
+ sizeof(kAnnexBFullSequenceHeaderObu), true));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBReducedStillImageSequenceHeaderObu,
+ sizeof(kAnnexBReducedStillImageSequenceHeaderObu),
+ true));
+}
+
+TEST(Av1Config, ReadWriteConfig) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+
+ // Test writing out the AV1 config.
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_written);
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+
+ // Test reading the AV1 config.
+ size_t bytes_read = 0;
+ ASSERT_EQ(0, read_av1config(&kAv1cAllZero[0], sizeof(kAv1cAllZero),
+ &bytes_read, &av1_config));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_read);
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc
index 1aa08044e..409fd23e1 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.cc
+++ b/third_party/aom/test/av1_convolve_2d_test_util.cc
@@ -71,9 +71,9 @@ void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
out_h);
for (int do_average = 0; do_average < 1; ++do_average) {
ConvolveParams conv_params1 =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
ConvolveParams conv_params2 =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
const int subx_range = has_subx ? 16 : 1;
const int suby_range = has_suby ? 16 : 1;
@@ -126,7 +126,7 @@ void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
const int do_average = 0;
ConvolveParams conv_params2 =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, 8);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
// Make sure that sizes 2xN and Nx2 are also tested for chroma.
const int num_sizes =
@@ -195,10 +195,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
out_h);
for (int do_average = 0; do_average <= 1; ++do_average) {
- ConvolveParams conv_params1 = get_conv_params_no_round(
- 0, do_average, 0, output1, MAX_SB_SIZE, 1, 8);
- ConvolveParams conv_params2 = get_conv_params_no_round(
- 0, do_average, 0, output2, MAX_SB_SIZE, 1, 8);
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
// Test special case where jnt_comp_avg is not used
conv_params1.use_jnt_comp_avg = 0;
@@ -331,7 +331,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
out_h);
ConvolveParams conv_params =
- get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, 8);
+ get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
conv_params.use_jnt_comp_avg = 0;
@@ -395,7 +395,7 @@ void AV1HighbdConvolve2DSrTest::RunSpeedTest(
suby = 0;
ConvolveParams conv_params =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
// Make sure that sizes 2xN and Nx2 are also tested for chroma.
const int num_sizes =
@@ -464,9 +464,9 @@ void AV1HighbdConvolve2DSrTest::RunCheckOutput(
out_h);
for (int do_average = 0; do_average < 1; ++do_average) {
ConvolveParams conv_params1 =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
ConvolveParams conv_params2 =
- get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
const int subx_range = has_subx ? 16 : 1;
const int suby_range = has_suby ? 16 : 1;
@@ -538,7 +538,7 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest(
out_h);
ConvolveParams conv_params =
- get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, bd);
+ get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
// Test special case where jnt_comp_avg is not used
conv_params.use_jnt_comp_avg = 0;
@@ -597,9 +597,9 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
out_h);
for (int do_average = 0; do_average <= 1; ++do_average) {
ConvolveParams conv_params1 = get_conv_params_no_round(
- 0, do_average, 0, output1, MAX_SB_SIZE, 1, bd);
+ do_average, 0, output1, MAX_SB_SIZE, 1, bd);
ConvolveParams conv_params2 = get_conv_params_no_round(
- 0, do_average, 0, output2, MAX_SB_SIZE, 1, bd);
+ do_average, 0, output2, MAX_SB_SIZE, 1, bd);
// Test special case where jnt_comp_avg is not used
conv_params1.use_jnt_comp_avg = 0;
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h
index cd4607d68..e0eb58410 100644
--- a/third_party/aom/test/av1_convolve_2d_test_util.h
+++ b/third_party/aom/test/av1_convolve_2d_test_util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
-#define TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
+#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
@@ -114,4 +114,4 @@ class AV1HighbdJntConvolve2DTest
} // namespace libaom_test
-#endif // TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#endif // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
index b99caaeeb..3b1698eeb 100644
--- a/third_party/aom/test/av1_convolve_scale_test.cc
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -279,7 +279,7 @@ class ConvolveScaleTestBase : public ::testing::Test {
filter_x_.set(ntaps_x_, false);
filter_y_.set(ntaps_y_, true);
convolve_params_ =
- get_conv_params_no_round(0, avg_ != false, 0, NULL, 0, 1, bd);
+ get_conv_params_no_round(avg_ != false, 0, NULL, 0, 1, bd);
delete image_;
image_ = new TestImage<SrcPixel>(width_, height_, bd_);
diff --git a/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
new file mode 100644
index 000000000..e8470e5d5
--- /dev/null
+++ b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+#include "aom/aom_decoder.h"
+#include "av1/decoder/decoder.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+struct ParamPassingTestVideo {
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const ParamPassingTestVideo kAV1ParamPassingTestVector = {
+ "niklas_1280_720_30.y4m", 1280, 720, 600, 3
+};
+
+struct EncodeParameters {
+ int32_t lossless;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_color_range_t color_range;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int32_t render_size[2];
+};
+
+const EncodeParameters kAV1EncodeParameterSet[] = {
+ { 1,
+ AOM_CICP_CP_BT_709,
+ AOM_CICP_TC_BT_709,
+ AOM_CICP_MC_BT_709,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_UNKNOWN,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_470_M,
+ AOM_CICP_TC_BT_470_M,
+ AOM_CICP_MC_BT_470_B_G,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_VERTICAL,
+ { 0, 0 } },
+ { 1,
+ AOM_CICP_CP_BT_601,
+ AOM_CICP_TC_BT_601,
+ AOM_CICP_MC_BT_601,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_COLOCATED,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_2020,
+ AOM_CICP_TC_BT_2020_10_BIT,
+ AOM_CICP_MC_BT_2020_NCL,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_RESERVED,
+ { 640, 480 } },
+};
+
+class AVxEncoderParmsGetToDecoder
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWithParam<EncodeParameters> {
+ protected:
+ AVxEncoderParmsGetToDecoder()
+ : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
+
+ virtual ~AVxEncoderParmsGetToDecoder() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 25;
+ test_video_ = kAV1ParamPassingTestVector;
+ cfg_.rc_target_bitrate = test_video_.bitrate;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
+ encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS,
+ encode_parms.transfer_characteristics);
+ encoder->Control(AV1E_SET_MATRIX_COEFFICIENTS,
+ encode_parms.matrix_coefficients);
+ encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range);
+ encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION,
+ encode_parms.chroma_sample_position);
+ encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless);
+ if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+ encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size);
+ }
+ }
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ (void)pts;
+ if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+ EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w);
+ EXPECT_EQ(encode_parms.render_size[1], (int)img.r_h);
+ }
+ EXPECT_EQ(encode_parms.color_primaries, img.cp);
+ EXPECT_EQ(encode_parms.transfer_characteristics, img.tc);
+ EXPECT_EQ(encode_parms.matrix_coefficients, img.mc);
+ EXPECT_EQ(encode_parms.color_range, img.range);
+ EXPECT_EQ(encode_parms.chroma_sample_position, img.csp);
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (encode_parms.lossless) {
+ EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]);
+ }
+ }
+
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ParamPassingTestVideo test_video_;
+
+ private:
+ EncodeParameters encode_parms;
+};
+
+TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video(
+ new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder,
+ ::testing::ValuesIn(kAV1EncodeParameterSet));
+} // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
index 6577e33b8..75f20536b 100644
--- a/third_party/aom/test/av1_fwd_txfm2d_test.cc
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -20,6 +20,7 @@
#include "test/util.h"
#include "test/av1_txfm_test.h"
#include "av1/common/av1_txfm.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
using libaom_test::ACMRandom;
using libaom_test::TYPE_TXFM;
@@ -351,4 +352,160 @@ INSTANTIATE_TEST_CASE_P(AVX2, AV1FwdTxfm2dTest,
Combine(ValuesIn(fwd_txfm_for_avx2),
Values(av1_lowbd_fwd_txfm_avx2)));
#endif // HAVE_AVX2
+
+typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+void AV1HighbdFwdTxfm2dMatchTest(TX_SIZE tx_size,
+ Highbd_fwd_txfm_func target_func) {
+ const int bd_ar[2] = { 10, 12 };
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ for (int i = 0; i < 2; ++i) {
+ const int bd = bd_ar[i];
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int cnt = 0; cnt < 500; ++cnt) {
+ if (cnt == 0) {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = (1 << bd) - 1;
+ }
+ }
+ } else {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+ }
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ target_func(input, output, input_stride, &param);
+ const int check_rows = AOMMIN(32, rows);
+ const int check_cols = AOMMIN(32, rows * cols / check_rows);
+ for (int r = 0; r < check_rows; ++r) {
+ for (int c = 0; c < check_cols; ++c) {
+ ASSERT_EQ(ref_output[r * check_cols + c],
+ output[r * check_cols + c])
+ << "[" << r << "," << c << "] cnt:" << cnt
+ << " tx_size: " << tx_size << " tx_type: " << tx_type;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void AV1HighbdFwdTxfm2dSpeedTest(TX_SIZE tx_size,
+ Highbd_fwd_txfm_func target_func) {
+ const int bd_ar[2] = { 10, 12 };
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ const int num_loops = 1000000 / (rows * cols);
+
+ for (int i = 0; i < 2; ++i) {
+ const int bd = bd_ar[i];
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ target_func(input, output, input_stride, &param);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ }
+ }
+ }
+}
+
+typedef ::testing::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
+
+class AV1HighbdFwdTxfm2dTest
+ : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
+
+TEST_P(AV1HighbdFwdTxfm2dTest, match) {
+ AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1HighbdFwdTxfm2dTest, DISABLED_Speed) {
+ AV1HighbdFwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE4_1
+static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+ TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_SSE4_1
+
} // namespace
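
One detail in AV1HighbdFwdTxfm2dMatchTest worth calling out: the first iteration feeds the maximum sample value, (1 << bd) - 1, to exercise overflow paths before switching to random input. A minimal sketch of that input schedule (the helper is illustrative, not library code):

#include <stdint.h>
#include <stdlib.h>

/* Fill the block with the extreme bd-bit value on pass 0 and with random
   bd-bit values afterwards, mirroring the match test's schedule. */
static void fill_block(int16_t *input, int stride, int rows, int cols, int bd,
                       int pass) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      input[r * stride + c] =
          pass == 0 ? (int16_t)((1 << bd) - 1) : (int16_t)(rand() % (1 << bd));
}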
diff --git a/third_party/aom/test/av1_highbd_iht_test.cc b/third_party/aom/test/av1_highbd_iht_test.cc
index 8cadc85e7..2d6490c2a 100644
--- a/third_party/aom/test/av1_highbd_iht_test.cc
+++ b/third_party/aom/test/av1_highbd_iht_test.cc
@@ -14,10 +14,12 @@
#include "config/av1_rtcd.h"
#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "av1/common/enums.h"
+#include "av1/common/scan.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
@@ -142,55 +144,8 @@ using ::testing::make_tuple;
#define PARAM_LIST_4X4 \
&av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
&av1_inv_txfm2d_add_4x4_c, 16
-#define PARAM_LIST_8X8 \
- &av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
- &av1_inv_txfm2d_add_8x8_c, 64
-#define PARAM_LIST_16X16 \
- &av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
- &av1_inv_txfm2d_add_16x16_c, 256
-#define PARAM_LIST_64X64 \
- &av1_fwd_txfm2d_64x64_c, &av1_inv_txfm2d_add_64x64_sse4_1, \
- &av1_inv_txfm2d_add_64x64_c, 4096
const IHbdHtParam kArrayIhtParam[] = {
- // 16x16
- make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
- make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
- make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
- make_tuple(PARAM_LIST_16X16, ADST_DCT, 12),
- make_tuple(PARAM_LIST_16X16, DCT_ADST, 10),
- make_tuple(PARAM_LIST_16X16, DCT_ADST, 12),
- make_tuple(PARAM_LIST_16X16, ADST_ADST, 10),
- make_tuple(PARAM_LIST_16X16, ADST_ADST, 12),
- make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 10),
- make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 12),
- make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 10),
- make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 12),
- make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 10),
- make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 12),
- make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 10),
- make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
- make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
- make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
- // 8x8
- make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
- make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
- make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
- make_tuple(PARAM_LIST_8X8, ADST_DCT, 12),
- make_tuple(PARAM_LIST_8X8, DCT_ADST, 10),
- make_tuple(PARAM_LIST_8X8, DCT_ADST, 12),
- make_tuple(PARAM_LIST_8X8, ADST_ADST, 10),
- make_tuple(PARAM_LIST_8X8, ADST_ADST, 12),
- make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 10),
- make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 12),
- make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 10),
- make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 12),
- make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 10),
- make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 12),
- make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 10),
- make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
- make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
- make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
// 4x4
make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
@@ -210,27 +165,151 @@ const IHbdHtParam kArrayIhtParam[] = {
make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
- make_tuple(PARAM_LIST_64X64, DCT_DCT, 10),
- make_tuple(PARAM_LIST_64X64, DCT_DCT, 12),
};
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
::testing::ValuesIn(kArrayIhtParam));
#endif // HAVE_SSE4_1
-#if HAVE_AVX2
-#define PARAM_LIST_32X32 \
- &av1_fwd_txfm2d_32x32_c, &av1_inv_txfm2d_add_32x32_avx2, \
- &av1_inv_txfm2d_add_32x32_c, 1024
-
-const IHbdHtParam kArrayIhtParam32x32[] = {
- // 32x32
- make_tuple(PARAM_LIST_32X32, DCT_DCT, 10),
- make_tuple(PARAM_LIST_32X32, DCT_DCT, 12),
+typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output,
+ int stride, const TxfmParam *txfm_param);
+
+typedef ::testing::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
+class AV1HighbdInvTxfm2d
+ : public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
+ int bit_depth);
+
+ private:
+ HighbdInvTxfm2dFunc target_func_;
};
-INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvHTNxN,
- ::testing::ValuesIn(kArrayIhtParam32x32));
+void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
+ int run_times, int bit_depth_) {
+ FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+ TxfmParam txfm_param;
+ const int BLK_WIDTH = 64;
+ const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+ DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, uint16_t, output[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+ int stride = BLK_WIDTH;
+ int rows = tx_size_high[tx_size_];
+ int cols = tx_size_wide[tx_size_];
+ const int rows_nonezero = AOMMIN(32, rows);
+ const int cols_nonezero = AOMMIN(32, cols);
+ const uint16_t mask = (1 << bit_depth_) - 1;
+ run_times /= (rows * cols);
+ run_times = AOMMAX(1, run_times);
+ const SCAN_ORDER *scan_order = get_default_scan(tx_size_, tx_type_);
+ const int16_t *scan = scan_order->scan;
+ const int16_t eobmax = rows_nonezero * cols_nonezero;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int randTimes = run_times == 1 ? (eobmax) : 1;
+
+ txfm_param.tx_type = tx_type_;
+ txfm_param.tx_size = tx_size_;
+ txfm_param.lossless = 0;
+ txfm_param.bd = bit_depth_;
+ txfm_param.is_hbd = 1;
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+ for (int cnt = 0; cnt < randTimes; ++cnt) {
+ for (int r = 0; r < BLK_WIDTH; ++r) {
+ for (int c = 0; c < BLK_WIDTH; ++c) {
+ input[r * cols + c] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+ output[r * stride + c] = rnd.Rand16() & mask;
+
+ ref_output[r * stride + c] = output[r * stride + c];
+ }
+ }
+ fwd_func_(input, inv_input, stride, tx_type_, bit_depth_);
+
+ // produce eob input by setting high freq coeffs to zero
+ const int eob = AOMMIN(cnt + 1, eobmax);
+ for (int i = eob; i < eobmax; i++) {
+ inv_input[scan[i]] = 0;
+ }
+ txfm_param.eob = eob;
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
+ stride, &txfm_param);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(inv_input, CONVERT_TO_BYTEPTR(output), stride, &txfm_param);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+ if (run_times > 10) {
+ printf(
+ "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ tx_size_, tx_type_, elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ } else {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
+ << "[" << r << "," << c << "] " << cnt
+ << " tx_size: " << static_cast<int>(tx_size_)
+ << " tx_type: " << tx_type_ << " eob " << eob;
+ }
+ }
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, match) {
+ int bitdepth_ar[2] = { 10, 12 };
+ for (int k = 0; k < 2; ++k) {
+ int bd = bitdepth_ar[k];
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 1, bd);
+ }
+ }
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
+ int bitdepth_ar[2] = { 10, 12 };
+ for (int k = 0; k < 2; ++k) {
+ int bd = bitdepth_ar[k];
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 1000000, bd);
+ }
+ }
+ }
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+#endif
-#endif // HAVE_AVX2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#endif
} // namespace
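
// Aside: a minimal, self-contained sketch of the eob masking step used in
// RunAV1InvTxfm2dTest above, which zeroes every coefficient at or beyond the
// end-of-block position in scan order before timing the inverse transform.
// The 4-entry scan below is hypothetical; libaom's real orders come from
// get_default_scan().

#include <cstdint>

// Clear coefficients whose scan position is >= eob, mirroring the
// inv_input[scan[i]] = 0 loop in the test.
static void MaskBeyondEob(int32_t *coeffs, const int16_t *scan, int num_coeffs,
                          int eob) {
  for (int i = eob; i < num_coeffs; ++i) coeffs[scan[i]] = 0;
}

int main() {
  int32_t coeffs[4] = { 10, 20, 30, 40 };
  const int16_t scan[4] = { 0, 1, 2, 3 };  // trivial scan, illustration only
  MaskBeyondEob(coeffs, scan, 4, /*eob=*/2);  // coeffs -> { 10, 20, 0, 0 }
  return 0;
}
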
diff --git a/third_party/aom/test/av1_round_shift_array_test.cc b/third_party/aom/test/av1_round_shift_array_test.cc
index 825d1348e..181a39460 100644
--- a/third_party/aom/test/av1_round_shift_array_test.cc
+++ b/third_party/aom/test/av1_round_shift_array_test.cc
@@ -27,9 +27,11 @@ namespace AV1CompRoundShift {
typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+#if HAVE_SSE4_1 || HAVE_NEON
const int kValidBitCheck[] = {
-4, -3, -2, -1, 0, 1, 2, 3, 4,
};
+#endif // HAVE_SSE4_1 || HAVE_NEON
typedef ::testing::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
CompRoundShiftParam;
diff --git a/third_party/aom/test/av1_txfm_test.h b/third_party/aom/test/av1_txfm_test.h
index 70d1a894f..a18164741 100644
--- a/third_party/aom/test/av1_txfm_test.h
+++ b/third_party/aom/test/av1_txfm_test.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXFM_TEST_H_
-#define AV1_TXFM_TEST_H_
+#ifndef AOM_TEST_AV1_TXFM_TEST_H_
+#define AOM_TEST_AV1_TXFM_TEST_H_
#include <stdio.h>
#include <stdlib.h>
@@ -132,4 +132,4 @@ void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
const int8_t cos_bit, int low_range,
int high_range);
} // namespace libaom_test
-#endif // AV1_TXFM_TEST_H_
+#endif // AOM_TEST_AV1_TXFM_TEST_H_
diff --git a/third_party/aom/test/blend_a64_mask_test.cc b/third_party/aom/test/blend_a64_mask_test.cc
index c9c6795ee..66ca6fc5f 100644
--- a/third_party/aom/test/blend_a64_mask_test.cc
+++ b/third_party/aom/test/blend_a64_mask_test.cc
@@ -43,10 +43,16 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
virtual ~BlendA64MaskTest() {}
- virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1) = 0;
+ virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
+ int run_times) = 0;
template <typename Pixel>
- void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/) {
+ void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) {
+ if (run_times > 1) {
+ *src0 = src0_;
+ *src1 = src1_;
+ return;
+ }
switch (this->rng_(3)) {
case 0: // Separate sources
*src0 = src0_;
@@ -68,19 +74,20 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
}
}
- void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/) {
+ void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/,
+ int /*run_times*/) {
*src0 = src0_;
*src1 = src1_;
}
uint8_t Rand1() { return this->rng_.Rand8() & 1; }
- void RunTest() {
- w_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
- h_ = 4 << this->rng_(MAX_SB_SIZE_LOG2 - 1);
-
- subx_ = Rand1();
- suby_ = Rand1();
+ void RunOneTest(int block_size, int subx, int suby, int run_times) {
+ w_ = block_size_wide[block_size];
+ h_ = block_size_high[block_size];
+ run_times = run_times > 1 ? run_times / w_ : 1;
+ subx_ = subx;
+ suby_ = suby;
dst_offset_ = this->rng_(33);
dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
@@ -100,19 +107,26 @@ class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
p_src0 = src0_;
p_src1 = src1_;
- GetSources(&p_src0, &p_src1, &dst_ref_[0]);
+ GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times);
- Execute(p_src0, p_src1);
+ Execute(p_src0, p_src1, run_times);
for (int r = 0; r < h_; ++r) {
for (int c = 0; c < w_; ++c) {
ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
dst_tst_[dst_offset_ + r * dst_stride_ + c])
- << w_ << "x" << h_ << " r: " << r << " c: " << c;
+ << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_
+ << " r: " << r << " c: " << c;
}
}
}
+ void RunTest(int block_size, int run_times) {
+ subx_ = Rand1();
+ suby_ = Rand1();
+ RunOneTest(block_size, subx_, suby_, run_times);
+ }
+
DstPixel dst_ref_[kBufSize];
DstPixel dst_tst_[kBufSize];
uint32_t dst_stride_;
@@ -148,19 +162,37 @@ typedef libaom_test::FuncParam<F8B> TestFuncs;
class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
protected:
- void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
- params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
- src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
- kMaxMaskWidth, w_, h_, subx_, suby_);
- ASM_REGISTER_STATE_CHECK(params_.tst_func(
- dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
- src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
- w_, h_, subx_, suby_));
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
}
};
TEST_P(BlendA64MaskTest8B, RandomValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_.Rand8();
dst_tst_[i] = rng_.Rand8();
@@ -172,12 +204,13 @@ TEST_P(BlendA64MaskTest8B, RandomValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest();
+ RunTest(bsize, 1);
}
}
TEST_P(BlendA64MaskTest8B, ExtremeValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_(2) + 254;
dst_tst_[i] = rng_(2) + 254;
@@ -188,14 +221,39 @@ TEST_P(BlendA64MaskTest8B, ExtremeValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
- RunTest();
+ RunTest(bsize, 1);
}
}
+TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
+ const int kRunTimes = 10000000;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunOneTest(bsize, 1, 1, kRunTimes);
+ RunOneTest(bsize, 1, 0, kRunTimes);
+ RunOneTest(bsize, 0, 1, kRunTimes);
+ RunOneTest(bsize, 0, 0, kRunTimes);
+ }
+}
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
::testing::Values(TestFuncs(
aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
+ ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
+ aom_blend_a64_mask_avx2)));
#endif // HAVE_SSE4_1
//////////////////////////////////////////////////////////////////////////////
@@ -215,22 +273,40 @@ class BlendA64MaskTest8B_d16
// max number of bits used by the source
static const int kSrcMaxBitsMask = 0x3fff;
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
ConvolveParams conv_params;
conv_params.round_0 = ROUND0_BITS;
conv_params.round_1 = COMPOUND_ROUND1_BITS;
- params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
- src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
- kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
- ASM_REGISTER_STATE_CHECK(params_.tst_func(
- dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
- src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, kMaxMaskWidth,
- w_, h_, subx_, suby_, &conv_params));
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
}
};
TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = rng_.Rand8();
dst_tst_[i] = rng_.Rand8();
@@ -242,12 +318,13 @@ TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest();
+ RunTest(bsize, 1);
}
}
TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
for (int i = 0; i < kBufSize; ++i) {
dst_ref_[i] = 255;
dst_tst_[i] = 255;
@@ -259,7 +336,7 @@ TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
- RunTest();
+ RunTest(bsize, 1);
}
}
@@ -270,6 +347,13 @@ INSTANTIATE_TEST_CASE_P(
aom_lowbd_blend_a64_d16_mask_sse4_1)));
#endif // HAVE_SSE4_1
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlendA64MaskTest8B_d16,
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_avx2)));
+#endif // HAVE_AVX2
+
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
NEON, BlendA64MaskTest8B_d16,
@@ -290,16 +374,31 @@ typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
protected:
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
- params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
- CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
- CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
- mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
- ASM_REGISTER_STATE_CHECK(params_.tst_func(
- CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
- CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
- CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_,
- kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_));
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
}
int bit_depth_;
@@ -307,6 +406,7 @@ class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
TEST_P(BlendA64MaskTestHBD, RandomValues) {
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
switch (rng_(3)) {
case 0: bit_depth_ = 8; break;
case 1: bit_depth_ = 10; break;
@@ -325,12 +425,13 @@ TEST_P(BlendA64MaskTestHBD, RandomValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest();
+ RunTest(bsize, 1);
}
}
TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
switch (rng_(3)) {
case 0: bit_depth_ = 8; break;
case 1: bit_depth_ = 10; break;
@@ -350,7 +451,7 @@ TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
- RunTest();
+ RunTest(bsize, 1);
}
}
@@ -380,21 +481,37 @@ class BlendA64MaskTestHBD_d16
static const int kSrcMaxBitsMask = (1 << 14) - 1;
static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
- void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
ConvolveParams conv_params;
conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
conv_params.round_1 = COMPOUND_ROUND1_BITS;
-
- params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
- p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
- src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
- &conv_params, bit_depth_);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+ bit_depth_);
+ }
if (params_.tst_func) {
- ASM_REGISTER_STATE_CHECK(params_.tst_func(
- CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
- p_src0 + src0_offset_, src0_stride_, p_src1 + src1_offset_,
- src1_stride_, mask_, kMaxMaskWidth, w_, h_, subx_, suby_,
- &conv_params, bit_depth_));
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_),
+ dst_stride_, p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+ bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
}
}
@@ -405,6 +522,7 @@ class BlendA64MaskTestHBD_d16
TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
if (params_.tst_func == NULL) return;
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
switch (rng_(3)) {
case 0: bit_depth_ = 8; break;
case 1: bit_depth_ = 10; break;
@@ -424,26 +542,28 @@ TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
for (int i = 0; i < kMaxMaskSize; ++i)
mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
- RunTest();
+ RunTest(bsize, 1);
}
}
+// TODO(Scott LaVarnway): fix this test.
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = (1 << bit_depth_) - 1;
+
+ src0_[i] = src_max_bits_mask_;
+ src1_[i] = src_max_bits_mask_;
+ }
-TEST_P(BlendA64MaskTestHBD_d16, SaturatedValues) {
- for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
- src_max_bits_mask_ =
- (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
-
- for (int i = 0; i < kBufSize; ++i) {
- dst_ref_[i] = 0;
- dst_tst_[i] = (1 << bit_depth_) - 1;
+ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
- src0_[i] = src_max_bits_mask_;
- src1_[i] = src_max_bits_mask_;
+ RunTest(bsize, 1);
}
-
- for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
-
- RunTest();
}
}
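
// Aside: each Execute() variant above repeats the same ref-versus-test timing
// scaffold. A condensed sketch of that pattern around the aom_usec_timer API
// (the only timing calls this diff itself uses); the template wrapper is an
// illustration, not libaom code. The speedup the tests print is then simply
// time_ref / time_test.

#include "aom_ports/aom_timer.h"

// Time run_times invocations of any callable and return elapsed microseconds.
template <typename Fn>
static double TimeUsec(Fn fn, int run_times) {
  aom_usec_timer timer;
  aom_usec_timer_start(&timer);
  for (int i = 0; i < run_times; ++i) fn();
  aom_usec_timer_mark(&timer);
  return static_cast<double>(aom_usec_timer_elapsed(&timer));
}
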
diff --git a/third_party/aom/test/blockd_test.cc b/third_party/aom/test/blockd_test.cc
new file mode 100644
index 000000000..ab624007c
--- /dev/null
+++ b/third_party/aom/test/blockd_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/blockd.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+// Verify the optimized implementation of get_partition_subsize() produces the
+// same results as the Partition_Subsize lookup table in the spec.
+TEST(BlockdTest, GetPartitionSubsize) {
+ // The Partition_Subsize table in the spec (Section 9.3. Conversion tables).
+ /* clang-format off */
+ static const BLOCK_SIZE kPartitionSubsize[10][BLOCK_SIZES_ALL] = {
+ {
+ BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }
+ };
+ /* clang-format on */
+
+ for (int partition = 0; partition < 10; partition++) {
+ for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ EXPECT_EQ(kPartitionSubsize[partition][bsize],
+ get_partition_subsize(static_cast<BLOCK_SIZE>(bsize),
+ static_cast<PARTITION_TYPE>(partition)));
+ }
+ }
+}
diff --git a/third_party/aom/test/boolcoder_test.cc b/third_party/aom/test/boolcoder_test.cc
index 72182de10..680ec1877 100644
--- a/third_party/aom/test/boolcoder_test.cc
+++ b/third_party/aom/test/boolcoder_test.cc
@@ -108,6 +108,7 @@ TEST(AV1, TestTell) {
double frac_diff_total = 0;
GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
for (int i = 0; i < kSymbols; i++) {
aom_read(&br, p, NULL);
uint32_t tell = aom_reader_tell(&br);
@@ -130,5 +131,43 @@ TEST(AV1, TestTell) {
// The average frac_diff error should be pretty small.
GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
<< " frac_diff_total: " << frac_diff_total;
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+}
+
+TEST(AV1, TestHasOverflowed) {
+ const int kBufferSize = 10000;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const int kSymbols = 1024;
+ // Coders are noisier at low probabilities, so we start at p = 4.
+ for (int p = 4; p < 256; p++) {
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 1, p);
+ }
+ aom_stop_encode(&bw);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ for (int i = 0; i < kSymbols; i++) {
+ GTEST_ASSERT_EQ(aom_read(&br, p, NULL), 1);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+ // In the worst case, the encoder uses just a tiny fraction of the last
+ // byte in the buffer. So to guarantee that aom_reader_has_overflowed()
+ // returns true, we have to consume very nearly 8 additional bits of data.
+ // In the worst case, one of the bits in that byte will be 1, and the rest
+ // will be zero. Once we are past that 1 bit, when the probability of
+ // reading a zero symbol from aom_read() is high, each additional symbol
+ // read will consume very little additional data (in the case that p == 255,
+ // approximately -log_2(255/256) ~= 0.0056 bits). In that case it would
+ // take around 178 calls to consume more than 8 bits. That is only an upper
+ // bound. In practice we are not guaranteed to hit the worst case and can
+ // get away with 174 calls.
+ for (int i = 0; i < 174; i++) {
+ aom_read(&br, p, NULL);
+ }
+ ASSERT_TRUE(aom_reader_has_overflowed(&br));
}
}
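
// Aside: the 0.0056-bit figure quoted in the comment above is just the
// information content of the high-probability symbol at p == 255. A
// standalone sanity check of that number (not part of the test):

#include <cmath>
#include <cstdio>

int main() {
  // -log2(255/256) ~= 0.0056 bits, matching the comment in TestHasOverflowed.
  std::printf("%.4f bits per symbol\n", -std::log2(255.0 / 256.0));
  return 0;
}
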
diff --git a/third_party/aom/test/clear_system_state.h b/third_party/aom/test/clear_system_state.h
index 7aa78243b..d38ff5dd5 100644
--- a/third_party/aom/test/clear_system_state.h
+++ b/third_party/aom/test/clear_system_state.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_CLEAR_SYSTEM_STATE_H_
-#define TEST_CLEAR_SYSTEM_STATE_H_
+#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_
+#define AOM_TEST_CLEAR_SYSTEM_STATE_H_
#include "config/aom_config.h"
@@ -28,4 +28,4 @@ inline void ClearSystemState() {
}
} // namespace libaom_test
-#endif // TEST_CLEAR_SYSTEM_STATE_H_
+#endif // AOM_TEST_CLEAR_SYSTEM_STATE_H_
diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h
index e6ae7f8c3..dd99110ee 100644
--- a/third_party/aom/test/codec_factory.h
+++ b/third_party/aom/test/codec_factory.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_CODEC_FACTORY_H_
-#define TEST_CODEC_FACTORY_H_
+#ifndef AOM_TEST_CODEC_FACTORY_H_
+#define AOM_TEST_CODEC_FACTORY_H_
#include "config/aom_config.h"
@@ -89,7 +89,7 @@ class AV1Decoder : public Decoder {
protected:
virtual aom_codec_iface_t *CodecInterface() const {
#if CONFIG_AV1_DECODER
- return &aom_codec_av1_dx_algo;
+ return aom_codec_av1_dx();
#else
return NULL;
#endif
@@ -105,7 +105,7 @@ class AV1Encoder : public Encoder {
protected:
virtual aom_codec_iface_t *CodecInterface() const {
#if CONFIG_AV1_ENCODER
- return &aom_codec_av1_cx_algo;
+ return aom_codec_av1_cx();
#else
return NULL;
#endif
@@ -147,7 +147,7 @@ class AV1CodecFactory : public CodecFactory {
virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
int usage) const {
#if CONFIG_AV1_ENCODER
- return aom_codec_enc_config_default(&aom_codec_av1_cx_algo, cfg, usage);
+ return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
#else
(void)cfg;
(void)usage;
@@ -167,4 +167,4 @@ const libaom_test::AV1CodecFactory kAV1;
__VA_ARGS__))
} // namespace libaom_test
-#endif // TEST_CODEC_FACTORY_H_
+#endif // AOM_TEST_CODEC_FACTORY_H_
diff --git a/third_party/aom/test/coding_path_sync.cc b/third_party/aom/test/coding_path_sync.cc
index 51a506004..6735236cc 100644
--- a/third_party/aom/test/coding_path_sync.cc
+++ b/third_party/aom/test/coding_path_sync.cc
@@ -27,7 +27,7 @@ namespace {
class CompressedSource {
public:
explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
- aom_codec_iface_t *algo = &aom_codec_av1_cx_algo;
+ aom_codec_iface_t *algo = aom_codec_av1_cx();
aom_codec_enc_cfg_t cfg;
aom_codec_enc_config_default(algo, &cfg, 0);
@@ -140,7 +140,7 @@ std::vector<int16_t> Serialize(const aom_image_t *img) {
class Decoder {
public:
explicit Decoder(int allowLowbitdepth) {
- aom_codec_iface_t *algo = &aom_codec_av1_dx_algo;
+ aom_codec_iface_t *algo = aom_codec_av1_dx();
aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
cfg.allow_lowbitdepth = allowLowbitdepth;
diff --git a/third_party/aom/test/comp_avg_pred_test.cc b/third_party/aom/test/comp_avg_pred_test.cc
index 8bd826eb4..9ad8973f0 100644
--- a/third_party/aom/test/comp_avg_pred_test.cc
+++ b/third_party/aom/test/comp_avg_pred_test.cc
@@ -50,9 +50,9 @@ TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); }
TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
- SSE2, AV1HighBDJNTCOMPAVGTest,
- libaom_test::AV1JNTCOMPAVG::BuildParams(aom_highbd_jnt_comp_avg_pred_sse2));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGTest,
+ libaom_test::AV1JNTCOMPAVG::BuildParams(
+ aom_highbd_jnt_comp_avg_pred_sse2, 1));
#endif
TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
diff --git a/third_party/aom/test/comp_avg_pred_test.h b/third_party/aom/test/comp_avg_pred_test.h
index ab2004c05..9661dd9f5 100644
--- a/third_party/aom/test/comp_avg_pred_test.h
+++ b/third_party/aom/test/comp_avg_pred_test.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_COMP_AVG_PRED_TEST_H_
-#define TEST_COMP_AVG_PRED_TEST_H_
+#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_
+#define AOM_TEST_COMP_AVG_PRED_TEST_H_
#include "config/aom_dsp_rtcd.h"
@@ -36,25 +36,21 @@ typedef void (*jntcompavgupsampled_func)(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param);
-
-typedef void (*highbdjntcompavg_func)(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height,
- const uint8_t *ref8, int ref_stride,
- const JNT_COMP_PARAMS *jcp_param);
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search);
typedef void (*highbdjntcompavgupsampled_func)(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param);
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search);
typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE>
JNTCOMPAVGUPSAMPLEDParam;
-typedef ::testing::tuple<int, highbdjntcompavg_func, BLOCK_SIZE>
+typedef ::testing::tuple<int, jntcompavg_func, BLOCK_SIZE>
HighbdJNTCOMPAVGParam;
typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
@@ -73,7 +69,8 @@ typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
}
::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams(
- highbdjntcompavg_func filter) {
+ jntcompavg_func filter, int is_hbd) {
+ (void)is_hbd;
return ::testing::Combine(::testing::Range(8, 13, 2),
::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
@@ -217,33 +214,39 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
JNT_COMP_PARAMS jnt_comp_params;
jnt_comp_params.use_jnt_comp_avg = 1;
int sub_x_q3, sub_y_q3;
- for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
- for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
- for (int ii = 0; ii < 2; ii++) {
- for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
-
- const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
- const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-
- aom_jnt_comp_avg_upsampled_pred_c(
- NULL, NULL, 0, 0, NULL, output, pred8 + offset_r * w + offset_c,
- in_w, in_h, sub_x_q3, sub_y_q3, ref8 + offset_r * w + offset_c,
- in_w, &jnt_comp_params);
- test_impl(NULL, NULL, 0, 0, NULL, output2,
- pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
- sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
- &jnt_comp_params);
-
- for (int i = 0; i < in_h; ++i) {
- for (int j = 0; j < in_w; ++j) {
- int idx = i * in_w + j;
- ASSERT_EQ(output[idx], output2[idx])
- << "Mismatch at unit tests for AV1JNTCOMPAVGUPSAMPLEDTest\n"
- << in_w << "x" << in_h << " Pixel mismatch at index " << idx
- << " = (" << i << ", " << j << "), sub pixel offset = ("
- << sub_y_q3 << ", " << sub_x_q3 << ")";
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+ for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+ aom_jnt_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, output,
+ pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+ sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+ &jnt_comp_params, subpel_search);
+ test_impl(NULL, NULL, 0, 0, NULL, output2,
+ pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+ sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+ &jnt_comp_params, subpel_search);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for "
+ "AV1JNTCOMPAVGUPSAMPLEDTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << sub_y_q3 << ", "
+ << sub_x_q3 << ")";
+ }
}
}
}
@@ -280,11 +283,12 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
const int num_loops = 1000000000 / (in_w + in_h);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
for (int i = 0; i < num_loops; ++i)
aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
in_w, in_h, sub_x_q3, sub_y_q3, ref8,
- in_w, &jnt_comp_params);
+ in_w, &jnt_comp_params, subpel_search);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -296,7 +300,7 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
for (int i = 0; i < num_loops; ++i)
test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
- sub_y_q3, ref8, in_w, &jnt_comp_params);
+ sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
@@ -316,7 +320,7 @@ class AV1HighBDJNTCOMPAVGTest
void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunCheckOutput(highbdjntcompavg_func test_impl) {
+ void RunCheckOutput(jntcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -344,13 +348,14 @@ class AV1HighBDJNTCOMPAVGTest
const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
aom_highbd_jnt_comp_avg_pred_c(
- output, CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
- in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+ CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
&jnt_comp_params);
- test_impl(output2, CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
- in_w, in_h,
- CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
- &jnt_comp_params);
+ test_impl(CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, &jnt_comp_params);
for (int i = 0; i < in_h; ++i) {
for (int j = 0; j < in_w; ++j) {
@@ -364,7 +369,7 @@ class AV1HighBDJNTCOMPAVGTest
}
}
}
- void RunSpeedTest(highbdjntcompavg_func test_impl) {
+ void RunSpeedTest(jntcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -392,9 +397,9 @@ class AV1HighBDJNTCOMPAVGTest
aom_usec_timer_start(&timer);
for (int i = 0; i < num_loops; ++i)
- aom_highbd_jnt_comp_avg_pred_c(output, CONVERT_TO_BYTEPTR(pred8), in_w,
- in_h, CONVERT_TO_BYTEPTR(ref8), in_w,
- &jnt_comp_params);
+ aom_highbd_jnt_comp_avg_pred_c(
+ CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -405,8 +410,8 @@ class AV1HighBDJNTCOMPAVGTest
aom_usec_timer_start(&timer1);
for (int i = 0; i < num_loops; ++i)
- test_impl(output2, CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
- CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+ test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
@@ -445,38 +450,41 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
JNT_COMP_PARAMS jnt_comp_params;
jnt_comp_params.use_jnt_comp_avg = 1;
int sub_x_q3, sub_y_q3;
-
- for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
- for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
- for (int ii = 0; ii < 2; ii++) {
- for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
-
- const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
- const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-
- aom_highbd_jnt_comp_avg_upsampled_pred_c(
- NULL, NULL, 0, 0, NULL, output,
- CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
- sub_x_q3, sub_y_q3,
- CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
- &jnt_comp_params);
- test_impl(NULL, NULL, 0, 0, NULL, output2,
- CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
- in_h, sub_x_q3, sub_y_q3,
- CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
- bd, &jnt_comp_params);
-
- for (int i = 0; i < in_h; ++i) {
- for (int j = 0; j < in_w; ++j) {
- int idx = i * in_w + j;
- ASSERT_EQ(output[idx], output2[idx])
- << "Mismatch at unit tests for "
- "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
- << in_w << "x" << in_h << " Pixel mismatch at index " << idx
- << " = (" << i << ", " << j << "), sub pixel offset = ("
- << sub_y_q3 << ", " << sub_x_q3 << ")";
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+ for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+ aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
+ &jnt_comp_params, subpel_search);
+ test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
+ in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, bd, &jnt_comp_params, subpel_search);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for "
+ "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << sub_y_q3 << ", "
+ << sub_x_q3 << ")";
+ }
}
}
}
@@ -511,12 +519,12 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
const int num_loops = 1000000000 / (in_w + in_h);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
-
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
for (int i = 0; i < num_loops; ++i)
aom_highbd_jnt_comp_avg_upsampled_pred_c(
- NULL, NULL, 0, 0, NULL, output, CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
- sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w, bd,
- &jnt_comp_params);
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, subpel_search);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -527,9 +535,10 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
aom_usec_timer_start(&timer1);
for (int i = 0; i < num_loops; ++i)
- test_impl(NULL, NULL, 0, 0, NULL, output2, CONVERT_TO_BYTEPTR(pred8),
- in_w, in_h, sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w,
- bd, &jnt_comp_params);
+ test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params,
+ subpel_search);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
@@ -543,4 +552,4 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
} // namespace AV1JNTCOMPAVG
} // namespace libaom_test
-#endif // TEST_COMP_AVG_PRED_TEST_H_
+#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_
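
// Aside: most of the signature churn in this header swaps bare uint16_t
// buffers for CONVERT_TO_BYTEPTR-wrapped ones, so the high-bit-depth
// functions share the uint8_t * calling convention of the 8-bit paths. A
// sketch of the tagging round-trip, assuming the shift-based CONVERT_TO_*
// macros from aom_dsp/aom_dsp_common.h; the encoding is an internal
// convention, and the tagged pointer must never be dereferenced directly.

#include <cstdint>

static inline uint8_t *AsBytePtr(uint16_t *p) {
  // A uint16_t buffer travels through uint8_t * parameters as a halved
  // address...
  return reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(p) >> 1);
}
static inline uint16_t *AsShortPtr(uint8_t *p) {
  // ...and callees double it back to recover the real 16-bit pointer.
  return reinterpret_cast<uint16_t *>(reinterpret_cast<uintptr_t>(p) << 1);
}
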
diff --git a/third_party/aom/test/comp_mask_variance_test.cc b/third_party/aom/test/comp_mask_variance_test.cc
index 0016ddd59..34be2aa6d 100644
--- a/third_party/aom/test/comp_mask_variance_test.cc
+++ b/third_party/aom/test/comp_mask_variance_test.cc
@@ -190,26 +190,29 @@ void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
const int w = block_size_wide[bsize];
const int h = block_size_high[bsize];
int wedge_types = (1 << get_wedge_bits_lookup(bsize));
-
- // loop through subx and suby
- for (int sub = 0; sub < 8 * 8; ++sub) {
- int subx = sub & 0x7;
- int suby = (sub >> 3);
- for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
- const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-
- aom_comp_mask_pred = aom_comp_mask_pred_c; // ref
- aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
- w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
- inv);
-
- aom_comp_mask_pred = test_impl; // test
- aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
- w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
- inv);
- ASSERT_EQ(CheckResult(w, h), true)
- << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
- << "," << suby << ")";
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ // loop through subx and suby
+ for (int sub = 0; sub < 8 * 8; ++sub) {
+ int subx = sub & 0x7;
+ int suby = (sub >> 3);
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ // ref
+ aom_comp_mask_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_,
+ MAX_SB_SIZE, mask, w, inv, subpel_search);
+
+ aom_comp_mask_pred = test_impl; // test
+ aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
+ w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
+ w, inv, subpel_search);
+ ASSERT_EQ(CheckResult(w, h), true)
+ << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+ << "," << suby << ")";
+ }
}
}
}
@@ -228,6 +231,7 @@ void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
const int num_loops = 1000000000 / (w + h);
comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
double elapsed_time[2] = { 0 };
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
for (int i = 0; i < 2; ++i) {
aom_usec_timer timer;
aom_usec_timer_start(&timer);
@@ -235,7 +239,7 @@ void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
for (int j = 0; j < num_loops; ++j) {
aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
- 0);
+ 0, subpel_search);
}
aom_usec_timer_mark(&timer);
double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
@@ -272,7 +276,7 @@ INSTANTIATE_TEST_CASE_P(
#endif // ifndef aom_comp_mask_pred
-typedef void (*highbd_comp_mask_pred_func)(uint16_t *comp_pred,
+typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8,
const uint8_t *pred8, int width,
int height, const uint8_t *ref8,
int ref_stride, const uint8_t *mask,
@@ -358,11 +362,11 @@ void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
- aom_highbd_comp_mask_pred_c(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
- CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w,
- inv);
+ aom_highbd_comp_mask_pred_c(
+ CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
- test_impl(comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h,
+ test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
ASSERT_EQ(CheckResult(w, h), true)
@@ -398,7 +402,7 @@ void AV1HighbdCompMaskVarianceTest::RunSpeedTest(
aom_usec_timer_start(&timer);
highbd_comp_mask_pred_func func = funcs[i];
for (int j = 0; j < num_loops; ++j) {
- func(comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
+ func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0);
}
aom_usec_timer_mark(&timer);
@@ -428,6 +432,14 @@ INSTANTIATE_TEST_CASE_P(
::testing::Range(8, 13, 2)));
#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1HighbdCompMaskVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
#ifndef aom_highbd_comp_mask_pred
// can't run this test if aom_highbd_comp_mask_pred is defined to
// aom_highbd_comp_mask_pred_c
@@ -458,25 +470,33 @@ void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput(
ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
}
- // loop through subx and suby
- for (int sub = 0; sub < 8 * 8; ++sub) {
- int subx = sub & 0x7;
- int suby = (sub >> 3);
- for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
- const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-
- aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c; // ref
- aom_highbd_comp_mask_upsampled_pred(
- NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
- subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
-
- aom_highbd_comp_mask_pred = test_impl; // test
- aom_highbd_comp_mask_upsampled_pred(
- NULL, NULL, 0, 0, NULL, comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h,
- subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
- ASSERT_EQ(CheckResult(w, h), true)
- << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
- << "," << suby << ")";
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ // loop through subx and suby
+ for (int sub = 0; sub < 8 * 8; ++sub) {
+ int subx = sub & 0x7;
+ int suby = (sub >> 3);
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c; // ref
+ aom_highbd_comp_mask_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
+ subpel_search);
+
+ aom_highbd_comp_mask_pred = test_impl; // test
+ aom_highbd_comp_mask_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
+ subpel_search);
+ ASSERT_EQ(CheckResult(w, h), true)
+ << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+ << "," << suby << ")";
+ }
}
}
}
@@ -508,10 +528,12 @@ void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest(
aom_usec_timer timer;
aom_usec_timer_start(&timer);
aom_highbd_comp_mask_pred = funcs[i];
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
for (int j = 0; j < num_loops; ++j) {
aom_highbd_comp_mask_upsampled_pred(
- NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
- subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0, bd_);
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby, CONVERT_TO_BYTEPTR(ref_),
+ MAX_SB_SIZE, mask, w, 0, bd_, subpel_search);
}
aom_usec_timer_mark(&timer);
double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
@@ -540,5 +562,13 @@ INSTANTIATE_TEST_CASE_P(
::testing::Range(8, 13, 2)));
#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1HighbdCompMaskUpVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
#endif // ifndef aom_highbd_comp_mask_pred
} // namespace AV1CompMaskVariance
diff --git a/third_party/aom/test/decode_api_test.cc b/third_party/aom/test/decode_api_test.cc
index 97cbd0655..c1beacee1 100644
--- a/third_party/aom/test/decode_api_test.cc
+++ b/third_party/aom/test/decode_api_test.cc
@@ -22,7 +22,7 @@ namespace {
TEST(DecodeAPI, InvalidParams) {
static const aom_codec_iface_t *kCodecs[] = {
#if CONFIG_AV1_DECODER
- &aom_codec_av1_dx_algo,
+ aom_codec_av1_dx(),
#endif
};
uint8_t buf[1] = { 0 };
diff --git a/third_party/aom/test/decode_test_driver.h b/third_party/aom/test/decode_test_driver.h
index 916efdad0..d13e13ea1 100644
--- a/third_party/aom/test/decode_test_driver.h
+++ b/third_party/aom/test/decode_test_driver.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_DECODE_TEST_DRIVER_H_
-#define TEST_DECODE_TEST_DRIVER_H_
+#ifndef AOM_TEST_DECODE_TEST_DRIVER_H_
+#define AOM_TEST_DECODE_TEST_DRIVER_H_
#include <cstring>
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -162,4 +162,4 @@ class DecoderTest {
} // namespace libaom_test
-#endif // TEST_DECODE_TEST_DRIVER_H_
+#endif // AOM_TEST_DECODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc
index c469d0871..c26f5720f 100644
--- a/third_party/aom/test/encode_api_test.cc
+++ b/third_party/aom/test/encode_api_test.cc
@@ -22,7 +22,7 @@ namespace {
TEST(EncodeAPI, InvalidParams) {
static const aom_codec_iface_t *kCodecs[] = {
#if CONFIG_AV1_ENCODER
- &aom_codec_av1_cx_algo,
+ aom_codec_av1_cx(),
#endif
};
uint8_t buf[1] = { 0 };
@@ -54,6 +54,16 @@ TEST(EncodeAPI, InvalidParams) {
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+
+ EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
+
+ aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
+ ASSERT_TRUE(glob_headers != NULL);
+ EXPECT_TRUE(glob_headers->buf != NULL);
+ free(glob_headers->buf);
+ free(glob_headers);
+
EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
index 35908430d..f3d61dc36 100644
--- a/third_party/aom/test/encode_test_driver.cc
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -205,13 +205,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
ASSERT_FALSE(::testing::Test::HasFatalFailure());
- unsigned long dec_init_flags = 0; // NOLINT
- // Use fragment decoder if encoder outputs partitions.
- // NOTE: fragment decoder and partition encoder are only supported by VP8.
- if (init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION)
- dec_init_flags |= AOM_CODEC_USE_INPUT_FRAGMENTS;
testing::internal::scoped_ptr<Decoder> decoder(
- codec_->CreateDecoder(dec_cfg, dec_init_flags));
+ codec_->CreateDecoder(dec_cfg, 0 /* flags */));
#if CONFIG_AV1_DECODER
if (decoder->IsAV1()) {
// Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
@@ -268,12 +263,6 @@ void EncoderTest::RunLoop(VideoSource *video) {
}
}
- // Flush the decoder when there are no more fragments.
- if ((init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
- const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
- if (!HandleDecodeResult(res_dec, decoder.get())) break;
- }
-
if (has_dxdata && has_cxdata) {
const aom_image_t *img_enc = encoder->GetPreviewFrame();
DxDataIterator dec_iter = decoder->GetDxData();
diff --git a/third_party/aom/test/encode_test_driver.h b/third_party/aom/test/encode_test_driver.h
index 138cd6a67..4f3f855cf 100644
--- a/third_party/aom/test/encode_test_driver.h
+++ b/third_party/aom/test/encode_test_driver.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_ENCODE_TEST_DRIVER_H_
-#define TEST_ENCODE_TEST_DRIVER_H_
+#ifndef AOM_TEST_ENCODE_TEST_DRIVER_H_
+#define AOM_TEST_ENCODE_TEST_DRIVER_H_
#include <string>
#include <vector>
@@ -246,4 +246,4 @@ class EncoderTest {
} // namespace libaom_test
-#endif // TEST_ENCODE_TEST_DRIVER_H_
+#endif // AOM_TEST_ENCODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/encodetxb_test.cc b/third_party/aom/test/encodetxb_test.cc
index ab6ec72c6..11cc07368 100644
--- a/third_party/aom/test/encodetxb_test.cc
+++ b/third_party/aom/test/encodetxb_test.cc
@@ -181,15 +181,22 @@ INSTANTIATE_TEST_CASE_P(SSE2, EncodeTxbTest,
::testing::Values(av1_get_nz_map_contexts_sse2));
#endif
-#if HAVE_SSE4_1
-class EncodeTxbInitLevelTest : public ::testing::TestWithParam<int> {
+typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
+ const int width, const int height,
+ uint8_t *const levels);
+
+typedef ::testing::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
+
+class EncodeTxbInitLevelTest
+ : public ::testing::TestWithParam<TxbInitLevelParam> {
public:
virtual ~EncodeTxbInitLevelTest() {}
virtual void TearDown() { libaom_test::ClearSystemState(); }
- void RunTest(int tx_size, int is_speed);
+ void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
};
-void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) {
+void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
+ int tx_size, int is_speed) {
const int width = get_txb_wide((TX_SIZE)tx_size);
const int height = get_txb_high((TX_SIZE)tx_size);
tran_low_t coeff[MAX_TX_SQUARE];
@@ -215,7 +222,7 @@ void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) {
const double t1 = get_time_mark(&timer);
aom_usec_timer_start(&timer);
for (int i = 0; i < run_times; ++i) {
- av1_txb_init_levels_sse4_1(coeff, width, height, levels1);
+ test_func(coeff, width, height, levels1);
}
const double t2 = get_time_mark(&timer);
if (is_speed) {
@@ -232,11 +239,24 @@ void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) {
}
}
-TEST_P(EncodeTxbInitLevelTest, match) { RunTest(GetParam(), 0); }
-TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) { RunTest(GetParam(), 1); }
+TEST_P(EncodeTxbInitLevelTest, match) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+}
-INSTANTIATE_TEST_CASE_P(SSE4_1, EncodeTxbInitLevelTest,
- ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1));
-#endif
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
} // namespace
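
// Aside: the refactor above turns EncodeTxbInitLevelTest's plain int
// parameter into a (function pointer, tx_size) tuple so the SSE4.1 and AVX2
// kernels reuse one fixture. A minimal sketch of the idiom with hypothetical
// names (KernelFunc, Identity):

#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

typedef int (*KernelFunc)(int);
typedef ::testing::tuple<KernelFunc, int> KernelParam;

class KernelTest : public ::testing::TestWithParam<KernelParam> {};

TEST_P(KernelTest, Match) {
  KernelFunc func = ::testing::get<0>(GetParam());
  const int size = ::testing::get<1>(GetParam());
  // A real test would compare func against its C reference here, as
  // EncodeTxbInitLevelTest::RunTest does above.
  EXPECT_EQ(func(size), func(size));
}

static int Identity(int x) { return x; }

INSTANTIATE_TEST_CASE_P(C, KernelTest,
                        ::testing::Combine(::testing::Values(&Identity),
                                           ::testing::Range(0, 4)));
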
diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc
index dd9fc2f8d..d9ac78282 100644
--- a/third_party/aom/test/ethread_test.cc
+++ b/third_party/aom/test/ethread_test.cc
@@ -50,7 +50,7 @@ class AVxEncoderThreadTest
SetMode(encoding_mode_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
- cfg_.g_lag_in_frames = 3;
+ cfg_.g_lag_in_frames = 5;
cfg_.rc_end_usage = AOM_VBR;
cfg_.rc_2pass_vbr_minsection_pct = 5;
cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -72,6 +72,7 @@ class AVxEncoderThreadTest
if (!encoder_initialized_) {
SetTileSize(encoder);
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_ROW_MT, row_mt_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
@@ -115,10 +116,11 @@ class AVxEncoderThreadTest
void DoTest() {
::libaom_test::YUVVideoSource video(
- "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 18);
+ "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 21);
cfg_.rc_target_bitrate = 1000;
// Encode using single thread.
+ row_mt_ = 0;
cfg_.g_threads = 1;
init_flags_ = AOM_CODEC_USE_PSNR;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -149,6 +151,55 @@ class AVxEncoderThreadTest
ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+
+ // Encode using multiple threads with row-mt enabled.
+ row_mt_ = 1;
+ cfg_.g_threads = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr2_row_mt_size_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_dec;
+ multi_thr2_row_mt_size_enc = size_enc_;
+ multi_thr2_row_mt_md5_enc = md5_enc_;
+ multi_thr2_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Disable the threads=3 test for now to reduce run time so that the
+ // nightly test does not time out.
+ // cfg_.g_threads = 3;
+ // ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // std::vector<size_t> multi_thr3_row_mt_size_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_dec;
+ // multi_thr3_row_mt_size_enc = size_enc_;
+ // multi_thr3_row_mt_md5_enc = md5_enc_;
+ // multi_thr3_row_mt_md5_dec = md5_dec_;
+ // size_enc_.clear();
+ // md5_enc_.clear();
+ // md5_dec_.clear();
+ // Check that the vectors are equal.
+ // ASSERT_EQ(multi_thr3_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr4_row_mt_size_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_dec;
+ multi_thr4_row_mt_size_enc = size_enc_;
+ multi_thr4_row_mt_md5_enc = md5_enc_;
+ multi_thr4_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
}
bool encoder_initialized_;
@@ -156,6 +207,7 @@ class AVxEncoderThreadTest
int set_cpu_used_;
int tile_cols_;
int tile_rows_;
+ int row_mt_;
::libaom_test::Decoder *decoder_;
std::vector<size_t> size_enc_;
std::vector<std::string> md5_enc_;
@@ -177,12 +229,13 @@ TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
}
// For AV1, only test speed 0 to 3.
+// Test cpu_used 2 and 3.
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
- ::testing::Values(::libaom_test::kTwoPassGood,
- ::libaom_test::kOnePassGood),
- ::testing::Range(2, 4), ::testing::Values(1, 2),
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(2, 4), ::testing::Values(0, 2),
::testing::Values(0, 1));
+// Test cpu_used 0 and 1.
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
@@ -212,14 +265,9 @@ TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
DoTest();
}
-AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTest,
- ::testing::Values(::libaom_test::kTwoPassGood,
- ::libaom_test::kOnePassGood),
- ::testing::Range(2, 4), ::testing::Values(6),
- ::testing::Values(0, 6));
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
- ::testing::Range(0, 2), ::testing::Values(6),
+ ::testing::Range(0, 4), ::testing::Values(0, 6),
::testing::Values(0, 6));
} // namespace
diff --git a/third_party/aom/test/external_frame_buffer_test.cc b/third_party/aom/test/external_frame_buffer_test.cc
new file mode 100644
index 000000000..c2af059a4
--- /dev/null
+++ b/third_party/aom/test/external_frame_buffer_test.cc
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {
+ uint8_t *data;
+ size_t size;
+ int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+ ExternalFrameBufferList()
+ : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
+
+ virtual ~ExternalFrameBufferList() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete[] ext_fb_list_[i].data;
+ }
+ delete[] ext_fb_list_;
+ }
+
+ // Creates the list to hold the external buffers. Returns true on success.
+ bool CreateBufferList(int num_buffers) {
+ if (num_buffers < 0) return false;
+
+ num_buffers_ = num_buffers;
+ ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+ EXPECT_TRUE(ext_fb_list_ != NULL);
+ memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+ return true;
+ }
+
+ // Searches the frame buffer list for a free frame buffer. Makes sure
+ // that the frame buffer is at least |min_size| in bytes. Marks that the
+ // frame buffer is in use by libaom. Finally sets |fb| to point to the
+ // external frame buffer. Returns < 0 on an error.
+ int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_TRUE(fb != NULL);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = new uint8_t[min_size];
+ memset(ext_fb_list_[idx].data, 0, min_size);
+ ext_fb_list_[idx].size = min_size;
+ }
+
+ SetFrameBuffer(idx, fb);
+
+ num_used_buffers_++;
+ return 0;
+ }
+
+ // Test function that will not allocate any data for the frame buffer.
+ // Returns < 0 on an error.
+ int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_TRUE(fb != NULL);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = NULL;
+ ext_fb_list_[idx].size = min_size;
+ }
+
+ SetFrameBuffer(idx, fb);
+ return 0;
+ }
+
+ // Marks the external frame buffer that |fb| is pointing to as free.
+ // Returns < 0 on an error.
+ int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
+ if (fb == NULL) {
+ EXPECT_TRUE(fb != NULL);
+ return -1;
+ }
+ ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
+ if (ext_fb == NULL) {
+ EXPECT_TRUE(ext_fb != NULL);
+ return -1;
+ }
+ EXPECT_EQ(1, ext_fb->in_use);
+ ext_fb->in_use = 0;
+ num_used_buffers_--;
+ return 0;
+ }
+
+ // Checks that the ximage data is contained within the external frame buffer
+ // private data passed back in the ximage.
+ void CheckXImageFrameBuffer(const aom_image_t *img) {
+ if (img->fb_priv != NULL) {
+ const struct ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+
+ ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+ img->planes[0] < (ext_fb->data + ext_fb->size));
+ }
+ }
+
+ int num_used_buffers() const { return num_used_buffers_; }
+
+ private:
+ // Returns the index of the first free frame buffer. Returns |num_buffers_|
+ // if there are no free frame buffers.
+ int FindFreeBufferIndex() {
+ int i;
+ // Find a free frame buffer.
+ for (i = 0; i < num_buffers_; ++i) {
+ if (!ext_fb_list_[i].in_use) break;
+ }
+ return i;
+ }
+
+ // Sets |fb| to an external frame buffer. |idx| is the index into the frame
+ // buffer list.
+ void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
+ ASSERT_TRUE(fb != NULL);
+ fb->data = ext_fb_list_[idx].data;
+ fb->size = ext_fb_list_[idx].size;
+ ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+ ext_fb_list_[idx].in_use = 1;
+ fb->priv = &ext_fb_list_[idx];
+ }
+
+ int num_buffers_;
+ int num_used_buffers_;
+ ExternalFrameBuffer *ext_fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libaom to request that the application return a frame
+// buffer of at least |min_size| bytes.
+int get_aom_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libaom to tell the application that |fb| is no longer
+// needed.
+int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for the frame buffer.
+int get_aom_zero_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_aom_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_aom_frame_buffer(void *user_priv,
+ aom_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ (void)fb;
+ return 0;
+}
+
+#endif // CONFIG_WEBM_IO
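+// For context, a minimal sketch (not part of the tests) of how an
+// application would register these callbacks with a decoder, assuming the
+// aom_codec_set_frame_buffer_functions() API from aom/aom_decoder.h:
+//
+// ExternalFrameBufferList fb_list;
+// fb_list.CreateBufferList(num_buffers);
+// aom_codec_set_frame_buffer_functions(&codec, get_aom_frame_buffer,
+//                                      release_aom_frame_buffer, &fb_list);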
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferMD5Test
+ : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<const char *> {
+ protected:
+ ExternalFrameBufferMD5Test()
+ : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
+ md5_file_(NULL), num_buffers_(0) {}
+
+ virtual ~ExternalFrameBufferMD5Test() {
+ if (md5_file_ != NULL) fclose(md5_file_);
+ }
+
+ virtual void PreDecodeFrameHook(
+ const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) {
+ if (num_buffers_ > 0 && video.frame_number() == 0) {
+ // Have libaom use frame buffers we create.
+ ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ ASSERT_EQ(AOM_CODEC_OK,
+ decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
+ ReleaseAV1FrameBuffer, this));
+ }
+ }
+
+ void OpenMD5File(const std::string &md5_file_name_) {
+ md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
+ char expected_md5[33];
+ char junk[128];
+
+ // Read correct md5 checksums.
+ const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+ ASSERT_NE(EOF, res) << "Failed to read md5 data";
+ expected_md5[32] = '\0';
+
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&img);
+ const char *const actual_md5 = md5_res.Get();
+
+ // Check md5 match.
+ ASSERT_STREQ(expected_md5, actual_md5)
+ << "Md5 checksums don't match: frame number = " << frame_number;
+ }
+
+ // Callback to get a free external frame buffer. Return value < 0 is an
+ // error.
+ static int GetAV1FrameBuffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+ }
+
+ // Callback to release an external frame buffer. Return value < 0 is an
+ // error.
+ static int ReleaseAV1FrameBuffer(void *user_priv,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.ReturnFrameBuffer(fb);
+ }
+
+ void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+ int num_buffers() const { return num_buffers_; }
+
+ private:
+ FILE *md5_file_;
+ int num_buffers_;
+ ExternalFrameBufferList fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+const char kAV1TestFile[] = "av1-1-b8-01-size-226x226.ivf";
+const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+ ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+
+ virtual void SetUp() {
+ video_ = new libaom_test::IVFVideoSource(kAV1TestFile);
+ ASSERT_TRUE(video_ != NULL);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_TRUE(decoder_ != NULL);
+ }
+
+ virtual void TearDown() {
+ delete decoder_;
+ decoder_ = NULL;
+ delete video_;
+ video_ = NULL;
+ }
+
+ // Passes the external frame buffer information to libaom.
+ aom_codec_err_t SetFrameBufferFunctions(
+ int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release) {
+ if (num_buffers > 0) {
+ num_buffers_ = num_buffers;
+ EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ }
+
+ return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+ }
+
+ aom_codec_err_t DecodeOneFrame() {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ CheckDecodedFrames();
+ if (res == AOM_CODEC_OK) video_->Next();
+ return res;
+ }
+
+ aom_codec_err_t DecodeRemainingFrames() {
+ for (; video_->cxdata() != NULL; video_->Next()) {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ if (res != AOM_CODEC_OK) return res;
+ CheckDecodedFrames();
+ }
+ return AOM_CODEC_OK;
+ }
+
+ protected:
+ void CheckDecodedFrames() {
+ libaom_test::DxDataIterator dec_iter = decoder_->GetDxData();
+ const aom_image_t *img = NULL;
+
+ // Get decompressed data
+ while ((img = dec_iter.Next()) != NULL) {
+ fb_list_.CheckXImageFrameBuffer(img);
+ }
+ }
+
+ libaom_test::IVFVideoSource *video_;
+ libaom_test::AV1Decoder *decoder_;
+ int num_buffers_;
+ ExternalFrameBufferList fb_list_;
+};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+ virtual void SetUp() {
+ video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
+ ASSERT_TRUE(video_ != NULL);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_TRUE(decoder_ != NULL);
+ }
+
+ virtual void CheckFrameBufferRelease() {
+ TearDown();
+ ASSERT_EQ(0, fb_list_.num_used_buffers());
+ }
+};
+#endif // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors and decodes them. Libaom
+// will call into the application to allocate a frame buffer when needed. The
+// md5 checksum is computed for each frame in the video file. The test passes
+// if the checksums match the expected md5 data and fails otherwise.
+TEST_P(ExternalFrameBufferMD5Test, DISABLED_ExtFBMD5Match) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+ const int jitter_buffers = 4;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ set_num_buffers(num_buffers);
+
+ // Open compressed video file.
+ testing::internal::scoped_ptr<libaom_test::CompressedVideoSource> video;
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {
+ video.reset(new libaom_test::IVFVideoSource(filename));
+ } else {
+#if CONFIG_WEBM_IO
+ video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+ fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+ filename.c_str());
+ return;
+#endif
+ }
+ ASSERT_TRUE(video.get() != NULL);
+ video->Init();
+
+ // Construct md5 file name.
+ const std::string md5_filename = filename + ".md5";
+ OpenMD5File(md5_filename);
+
+ // Decode frame, and check the md5 matching.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
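+// The .md5 files read above are assumed to hold one line per frame, with the
+// 32-character checksum first and a second, ignored token, e.g.:
+//   d5008c2c33f73f742a34a6b06ea12e49  frame-0001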
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS.
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+ const int jitter_buffers = 8;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, DISABLED_NotEnoughBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS. Most files will
+ // only use 5 frame buffers at one time.
+ const int num_buffers = 2;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ // Only run this on long clips. Decoding a very short clip will return
+ // AOM_CODEC_OK even with only 2 buffers.
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, DISABLED_NoRelease) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ do_not_release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_zero_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(
+ num_buffers, get_aom_one_less_byte_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(
+ AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, NULL, release_aom_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, NULL));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ ASSERT_EQ(AOM_CODEC_ERROR,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+ CheckFrameBufferRelease();
+}
+#endif // CONFIG_WEBM_IO
+
+AV1_INSTANTIATE_TEST_CASE(
+ ExternalFrameBufferMD5Test,
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors));
+} // namespace
diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc
index 5c8ec069c..e24e451a3 100644
--- a/third_party/aom/test/fft_test.cc
+++ b/third_party/aom/test/fft_test.cc
@@ -17,9 +17,6 @@
#include "aom_dsp/fft_common.h"
#include "aom_mem/aom_mem.h"
-#if ARCH_X86 || ARCH_X86_64
-#include "aom_ports/x86.h"
-#endif
#include "av1/common/common.h"
#include "config/aom_dsp_rtcd.h"
#include "test/acm_random.h"
@@ -80,14 +77,11 @@ std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
struct FFTTestArg {
int n;
void (*fft)(const float *input, float *temp, float *output);
- int flag;
- FFTTestArg(int n_in, tform_fun_t fft_in, int flag_in)
- : n(n_in), fft(fft_in), flag(flag_in) {}
+ FFTTestArg(int n_in, tform_fun_t fft_in) : n(n_in), fft(fft_in) {}
};
std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
- return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft
- << " flag:" << test_arg.flag << "}";
+ return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft << " }";
}
class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
@@ -100,26 +94,18 @@ class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
memset(input_, 0, sizeof(*input_) * n * n);
memset(temp_, 0, sizeof(*temp_) * n * n);
memset(output_, 0, sizeof(*output_) * n * n * 2);
-#if ARCH_X86 || ARCH_X86_64
- disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
-#else
- disabled_ = GetParam().flag != 0;
-#endif
}
void TearDown() {
aom_free(input_);
aom_free(temp_);
aom_free(output_);
}
- int disabled_;
float *input_;
float *temp_;
float *output_;
};
TEST_P(FFT2DTest, Correct) {
- if (disabled_) return;
-
int n = GetParam().n;
for (int i = 0; i < n * n; ++i) {
input_[i] = 1;
@@ -137,8 +123,6 @@ TEST_P(FFT2DTest, Correct) {
}
TEST_P(FFT2DTest, Benchmark) {
- if (disabled_) return;
-
int n = GetParam().n;
float sum = 0;
for (int i = 0; i < 1000 * (64 - n); ++i) {
@@ -149,39 +133,40 @@ TEST_P(FFT2DTest, Benchmark) {
}
}
-INSTANTIATE_TEST_CASE_P(
- FFT2DTestC, FFT2DTest,
- ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c, 0),
- FFTTestArg(4, aom_fft4x4_float_c, 0),
- FFTTestArg(8, aom_fft8x8_float_c, 0),
- FFTTestArg(16, aom_fft16x16_float_c, 0),
- FFTTestArg(32, aom_fft32x32_float_c, 0)));
+INSTANTIATE_TEST_CASE_P(C, FFT2DTest,
+ ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+ FFTTestArg(4, aom_fft4x4_float_c),
+ FFTTestArg(8, aom_fft8x8_float_c),
+ FFTTestArg(16, aom_fft16x16_float_c),
+ FFTTestArg(32,
+ aom_fft32x32_float_c)));
#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
- FFT2DTestSSE2, FFT2DTest,
- ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2, HAS_SSE2),
- FFTTestArg(8, aom_fft8x8_float_sse2, HAS_SSE2),
- FFTTestArg(16, aom_fft16x16_float_sse2, HAS_SSE2),
- FFTTestArg(32, aom_fft32x32_float_sse2, HAS_SSE2)));
-
+ SSE2, FFT2DTest,
+ ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
+ FFTTestArg(8, aom_fft8x8_float_sse2),
+ FFTTestArg(16, aom_fft16x16_float_sse2),
+ FFTTestArg(32, aom_fft32x32_float_sse2)));
+#endif // HAVE_SSE2
+#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
- FFT2DTestAVX2, FFT2DTest,
- ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2, HAS_AVX2),
- FFTTestArg(16, aom_fft16x16_float_avx2, HAS_AVX2),
- FFTTestArg(32, aom_fft32x32_float_avx2, HAS_AVX2)));
-#endif
+ AVX2, FFT2DTest,
+ ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
+ FFTTestArg(16, aom_fft16x16_float_avx2),
+ FFTTestArg(32, aom_fft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // ARCH_X86 || ARCH_X86_64
struct IFFTTestArg {
int n;
tform_fun_t ifft;
- int flag;
- IFFTTestArg(int n_in, tform_fun_t ifft_in, int flag_in)
- : n(n_in), ifft(ifft_in), flag(flag_in) {}
+ IFFTTestArg(int n_in, tform_fun_t ifft_in) : n(n_in), ifft(ifft_in) {}
};
std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
return os << "ifft_arg { n:" << test_arg.n << " fft:" << test_arg.ifft
- << " flag:" << test_arg.flag << "}";
+ << " }";
}
class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
@@ -194,25 +179,18 @@ class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
memset(input_, 0, sizeof(*input_) * n * n * 2);
memset(temp_, 0, sizeof(*temp_) * n * n * 2);
memset(output_, 0, sizeof(*output_) * n * n);
-#if ARCH_X86 || ARCH_X86_64
- disabled_ = GetParam().flag != 0 && !(x86_simd_caps() & GetParam().flag);
-#else
- disabled_ = GetParam().flag != 0;
-#endif
}
void TearDown() {
aom_free(input_);
aom_free(temp_);
aom_free(output_);
}
- int disabled_;
float *input_;
float *temp_;
float *output_;
};
TEST_P(IFFT2DTest, Correctness) {
- if (disabled_) return;
int n = GetParam().n;
ASSERT_GE(n, 2);
std::vector<float> expected(n * n);
@@ -240,7 +218,6 @@ TEST_P(IFFT2DTest, Correctness) {
};
TEST_P(IFFT2DTest, Benchmark) {
- if (disabled_) return;
int n = GetParam().n;
float sum = 0;
for (int i = 0; i < 1000 * (64 - n); ++i) {
@@ -251,24 +228,29 @@ TEST_P(IFFT2DTest, Benchmark) {
}
}
INSTANTIATE_TEST_CASE_P(
- IFFT2DTestC, IFFT2DTest,
- ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c, 0),
- IFFTTestArg(4, aom_ifft4x4_float_c, 0),
- IFFTTestArg(8, aom_ifft8x8_float_c, 0),
- IFFTTestArg(16, aom_ifft16x16_float_c, 0),
- IFFTTestArg(32, aom_ifft32x32_float_c, 0)));
+ C, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
+ IFFTTestArg(4, aom_ifft4x4_float_c),
+ IFFTTestArg(8, aom_ifft8x8_float_c),
+ IFFTTestArg(16, aom_ifft16x16_float_c),
+ IFFTTestArg(32, aom_ifft32x32_float_c)));
#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
- IFFT2DTestSSE2, IFFT2DTest,
- ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2, HAS_SSE2),
- IFFTTestArg(8, aom_ifft8x8_float_sse2, HAS_SSE2),
- IFFTTestArg(16, aom_ifft16x16_float_sse2, HAS_SSE2),
- IFFTTestArg(32, aom_ifft32x32_float_sse2, HAS_SSE2)));
+ SSE2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
+ IFFTTestArg(8, aom_ifft8x8_float_sse2),
+ IFFTTestArg(16, aom_ifft16x16_float_sse2),
+ IFFTTestArg(32, aom_ifft32x32_float_sse2)));
+#endif // HAVE_SSE2
+#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
- IFFT2DTestAVX2, IFFT2DTest,
- ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2, HAS_AVX2),
- IFFTTestArg(16, aom_ifft16x16_float_avx2, HAS_AVX2),
- IFFTTestArg(32, aom_ifft32x32_float_avx2, HAS_AVX2)));
-#endif
+ AVX2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
+ IFFTTestArg(16, aom_ifft16x16_float_avx2),
+ IFFTTestArg(32, aom_ifft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // ARCH_X86 || ARCH_X86_64
+
} // namespace
diff --git a/third_party/aom/test/function_equivalence_test.h b/third_party/aom/test/function_equivalence_test.h
index 4b22c74a2..f27068902 100644
--- a/third_party/aom/test/function_equivalence_test.h
+++ b/third_party/aom/test/function_equivalence_test.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
-#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "test/acm_random.h"
@@ -66,4 +66,4 @@ class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
};
} // namespace libaom_test
-#endif // TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#endif // AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/third_party/aom/test/hiprec_convolve_test_util.h b/third_party/aom/test/hiprec_convolve_test_util.h
index 81471c8b9..2abe24b57 100644
--- a/third_party/aom/test/hiprec_convolve_test_util.h
+++ b/third_party/aom/test/hiprec_convolve_test_util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
-#define TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
#include "config/av1_rtcd.h"
@@ -90,4 +90,4 @@ class AV1HighbdHiprecConvolveTest
} // namespace libaom_test
-#endif // TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#endif // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/third_party/aom/test/i420_video_source.h b/third_party/aom/test/i420_video_source.h
index 0825296d7..233e7152b 100644
--- a/third_party/aom/test/i420_video_source.h
+++ b/third_party/aom/test/i420_video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_I420_VIDEO_SOURCE_H_
-#define TEST_I420_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_I420_VIDEO_SOURCE_H_
+#define AOM_TEST_I420_VIDEO_SOURCE_H_
#include <cstdio>
#include <cstdlib>
#include <string>
@@ -31,4 +31,4 @@ class I420VideoSource : public YUVVideoSource {
} // namespace libaom_test
-#endif // TEST_I420_VIDEO_SOURCE_H_
+#endif // AOM_TEST_I420_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/invalid_file_test.cc b/third_party/aom/test/invalid_file_test.cc
index 869f3da66..5b4f5a6c3 100644
--- a/third_party/aom/test/invalid_file_test.cc
+++ b/third_party/aom/test/invalid_file_test.cc
@@ -49,29 +49,31 @@ class InvalidFileTest : public ::libaom_test::DecoderTest,
const libaom_test::CompressedVideoSource &video,
libaom_test::Decoder *decoder) {
EXPECT_TRUE(res_file_ != NULL);
- int expected_res_dec;
+ int expected_res_dec = -1;
// Read integer result.
const int res = fscanf(res_file_, "%d", &expected_res_dec);
EXPECT_NE(res, EOF) << "Read result data failed";
- // Check results match.
- const DecodeParam input = GET_PARAM(1);
- if (input.threads > 1) {
- // The serial decode check is too strict for tile-threaded decoding as
- // there is no guarantee on the decode order nor which specific error
- // will take precedence. Currently a tile-level error is not forwarded so
- // the frame will simply be marked corrupt.
- EXPECT_TRUE(res_dec == expected_res_dec ||
- res_dec == AOM_CODEC_CORRUPT_FRAME)
- << "Results don't match: frame number = " << video.frame_number()
- << ". (" << decoder->DecodeError()
- << "). Expected: " << expected_res_dec << " or "
- << AOM_CODEC_CORRUPT_FRAME;
- } else {
- EXPECT_EQ(expected_res_dec, res_dec)
- << "Results don't match: frame number = " << video.frame_number()
- << ". (" << decoder->DecodeError() << ")";
+ if (expected_res_dec != -1) {
+ // Check results match.
+ const DecodeParam input = GET_PARAM(1);
+ if (input.threads > 1) {
+ // The serial decode check is too strict for tile-threaded decoding as
+ // there is no guarantee on the decode order nor which specific error
+ // will take precedence. Currently a tile-level error is not forwarded
+ // so the frame will simply be marked corrupt.
+ EXPECT_TRUE(res_dec == expected_res_dec ||
+ res_dec == AOM_CODEC_CORRUPT_FRAME)
+ << "Results don't match: frame number = " << video.frame_number()
+ << ". (" << decoder->DecodeError()
+ << "). Expected: " << expected_res_dec << " or "
+ << AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ EXPECT_EQ(expected_res_dec, res_dec)
+ << "Results don't match: frame number = " << video.frame_number()
+ << ". (" << decoder->DecodeError() << ")";
+ }
}
return !HasFailure();
@@ -106,6 +108,12 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
const DecodeParam kAV1InvalidFileTests[] = {
{ 1, "invalid-bug-1814.ivf" },
+ { 4, "invalid-oss-fuzz-9463.ivf" },
+ { 1, "invalid-oss-fuzz-9482.ivf" },
+ { 1, "invalid-oss-fuzz-9720.ivf" },
+ { 1, "invalid-oss-fuzz-10061.ivf" },
+ { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf" },
+ { 1, "invalid-oss-fuzz-10227.ivf" },
};
AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
diff --git a/third_party/aom/test/ivf_video_source.h b/third_party/aom/test/ivf_video_source.h
index 4b2713537..ff2841445 100644
--- a/third_party/aom/test/ivf_video_source.h
+++ b/third_party/aom/test/ivf_video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_IVF_VIDEO_SOURCE_H_
-#define TEST_IVF_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_IVF_VIDEO_SOURCE_H_
+#define AOM_TEST_IVF_VIDEO_SOURCE_H_
#include <cstdio>
#include <cstdlib>
@@ -111,4 +111,4 @@ class IVFVideoSource : public CompressedVideoSource {
} // namespace libaom_test
-#endif // TEST_IVF_VIDEO_SOURCE_H_
+#endif // AOM_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/log2_test.cc b/third_party/aom/test/log2_test.cc
new file mode 100644
index 000000000..d7840c68b
--- /dev/null
+++ b/third_party/aom/test/log2_test.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/bitops.h"
+#include "av1/common/entropymode.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(Log2Test, GetMsb) {
+ // Test small numbers exhaustively.
+ for (unsigned int n = 1; n < 10000; n++) {
+ EXPECT_EQ(get_msb(n), static_cast<int>(floor(log2(n))));
+ }
+
+ // Test every power of 2 and the two adjacent numbers.
+ for (int exponent = 2; exponent < 32; exponent++) {
+ const unsigned int power_of_2 = 1U << exponent;
+ EXPECT_EQ(get_msb(power_of_2 - 1), exponent - 1);
+ EXPECT_EQ(get_msb(power_of_2), exponent);
+ EXPECT_EQ(get_msb(power_of_2 + 1), exponent);
+ }
+}
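+// For reference, a portable fallback equivalent to the behavior checked
+// above (a sketch; the library may use compiler intrinsics instead):
+//
+// static int get_msb_reference(unsigned int n) {
+//   assert(n != 0);
+//   int msb = 0;
+//   while (n >>= 1) ++msb;  // index of the highest set bit: floor(log2(n))
+//   return msb;
+// }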
+
+TEST(Log2Test, Av1CeilLog2) {
+ // Test small numbers exhaustively.
+ EXPECT_EQ(av1_ceil_log2(0), 0);
+ for (int n = 1; n < 10000; n++) {
+ EXPECT_EQ(av1_ceil_log2(n), static_cast<int>(ceil(log2(n))));
+ }
+
+ // Test every power of 2 and the two adjacent numbers.
+ for (int exponent = 2; exponent < 31; exponent++) {
+ const int power_of_2 = 1 << exponent;
+ EXPECT_EQ(av1_ceil_log2(power_of_2 - 1), exponent);
+ EXPECT_EQ(av1_ceil_log2(power_of_2), exponent);
+ // The current implementation of av1_ceil_log2 only works up to 2^30.
+ if (exponent < 30) {
+ EXPECT_EQ(av1_ceil_log2(power_of_2 + 1), exponent + 1);
+ }
+ }
+}
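+// The relation exercised above can also be stated via get_msb (a sketch,
+// assuming n stays within the implementation's supported range):
+//
+// static int ceil_log2_reference(int n) {
+//   return (n < 2) ? 0 : get_msb(n - 1) + 1;  // == ceil(log2(n))
+// }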
diff --git a/third_party/aom/test/md5_helper.h b/third_party/aom/test/md5_helper.h
index b2b14cf21..9443cb262 100644
--- a/third_party/aom/test/md5_helper.h
+++ b/third_party/aom/test/md5_helper.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_MD5_HELPER_H_
-#define TEST_MD5_HELPER_H_
+#ifndef AOM_TEST_MD5_HELPER_H_
+#define AOM_TEST_MD5_HELPER_H_
#include "aom/aom_decoder.h"
#include "common/md5_utils.h"
@@ -73,4 +73,4 @@ class MD5 {
} // namespace libaom_test
-#endif // TEST_MD5_HELPER_H_
+#endif // AOM_TEST_MD5_HELPER_H_
diff --git a/third_party/aom/test/obmc_variance_test.cc b/third_party/aom/test/obmc_variance_test.cc
index 04fee8285..4563b964a 100644
--- a/third_party/aom/test/obmc_variance_test.cc
+++ b/third_party/aom/test/obmc_variance_test.cc
@@ -93,6 +93,43 @@ TEST_P(ObmcVarianceTest, ExtremeValues) {
}
}
+TEST_P(ObmcVarianceTest, DISABLED_Speed) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = this->rng_.Rand8();
+ wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ const int num_loops = 1000000;
+ unsigned int ref_sse, tst_sse;
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf("c_time=%d \t simd_time=%d \t gain=%d \n", elapsed_time_c,
+ elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
+}
+
#if HAVE_SSE4_1
const ObmcVarianceTest::ParamType sse4_functions[] = {
TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
@@ -117,6 +154,30 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceTest,
::testing::ValuesIn(sse4_functions));
#endif // HAVE_SSE4_1
+#if HAVE_AVX2
+const ObmcVarianceTest::ParamType avx2_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2),
+ // The 4-wide cases pair the SSE4.1 variants with C; no AVX2 kernel is
+ // listed for them.
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcVarianceTest,
+ ::testing::ValuesIn(avx2_functions));
+#endif // HAVE_AVX2
+
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/aom/test/pickrst_test.cc b/third_party/aom/test/pickrst_test.cc
new file mode 100644
index 000000000..040e8e8b7
--- /dev/null
+++ b/third_party/aom/test/pickrst_test.cc
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/pickrst.h"
+using libaom_test::FunctionEquivalenceTest;
+
+#define MAX_DATA_BLOCK 384
+
+namespace {
+static const int kIterations = 100;
+
+typedef int64_t (*lowbd_pixel_proj_error_func)(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+
+typedef libaom_test::FuncParam<lowbd_pixel_proj_error_func> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef ::testing::tuple<const lowbd_pixel_proj_error_func>
+ PixelProjErrorTestParam;
+
+class PixelProjErrorTest
+ : public ::testing::TestWithParam<PixelProjErrorTestParam> {
+ public:
+ virtual void SetUp() {
+ target_func_ = GET_PARAM(0);
+ src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(uint8_t)));
+ dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(uint8_t)));
+ flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(int32_t)));
+ flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(int32_t)));
+ }
+ virtual void TearDown() {
+ aom_free(src_);
+ aom_free(dgd_);
+ aom_free(flt0_);
+ aom_free(flt1_);
+ }
+ void runPixelProjErrorTest(int32_t run_times);
+ void runPixelProjErrorTest_ExtremeValues();
+
+ private:
+ lowbd_pixel_proj_error_func target_func_;
+ ACMRandom rng_;
+ uint8_t *src_;
+ uint8_t *dgd_;
+ int32_t *flt0_;
+ int32_t *flt1_;
+};
+
+void PixelProjErrorTest::runPixelProjErrorTest(int32_t run_times) {
+ int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
+ int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ int xq[2];
+ const int iters = run_times == 1 ? kIterations : 4;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t err_ref = 0, err_test = 1;
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = rng_.Rand8();
+ src_[i] = rng_.Rand8();
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+ xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+ params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+ params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ uint8_t *dgd = dgd_;
+ uint8_t *src = src_;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ err_ref = av1_lowbd_pixel_proj_error_c(src, h_end, v_end, src_stride, dgd,
+ dgd_stride, flt0_, flt0_stride,
+ flt1_, flt1_stride, xq, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ err_test =
+ target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_,
+ flt0_stride, flt1_, flt1_stride, xq, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+ params.r[1], h_end, v_end, time1, time2, time1 / time2);
+ }
+ ASSERT_EQ(err_ref, err_test);
+ }
+}
+
+void PixelProjErrorTest::runPixelProjErrorTest_ExtremeValues() {
+ const int h_start = 0;
+ int h_end = 192;
+ const int v_start = 0;
+ int v_end = 192;
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ int xq[2];
+ const int iters = kIterations;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t err_ref = 0, err_test = 1;
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = 0;
+ src_[i] = 255;
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+ xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ params.r[0] = rng_.Rand8() % MAX_RADIUS;
+ params.r[1] = rng_.Rand8() % MAX_RADIUS;
+ params.s[0] = rng_.Rand8() % MAX_RADIUS;
+ params.s[1] = rng_.Rand8() % MAX_RADIUS;
+ uint8_t *dgd = dgd_;
+ uint8_t *src = src_;
+
+ err_ref = av1_lowbd_pixel_proj_error_c(
+ src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+ flt0_, flt0_stride, flt1_, flt1_stride, xq, &params);
+
+ err_test = target_func_(src, h_end - h_start, v_end - v_start, src_stride,
+ dgd, dgd_stride, flt0_, flt0_stride, flt1_,
+ flt1_stride, xq, &params);
+
+ ASSERT_EQ(err_ref, err_test);
+ }
+}
+
+TEST_P(PixelProjErrorTest, RandomValues) { runPixelProjErrorTest(1); }
+
+TEST_P(PixelProjErrorTest, ExtremeValues) {
+ runPixelProjErrorTest_ExtremeValues();
+}
+
+TEST_P(PixelProjErrorTest, DISABLED_Speed) { runPixelProjErrorTest(200000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/quantize_func_test.cc b/third_party/aom/test/quantize_func_test.cc
index 97e73bff0..554d0c721 100644
--- a/third_party/aom/test/quantize_func_test.cc
+++ b/third_party/aom/test/quantize_func_test.cc
@@ -27,15 +27,7 @@
namespace {
using libaom_test::ACMRandom;
-#define QUAN_PARAM_LIST \
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, \
- const int16_t *zbin_ptr, const int16_t *round_ptr, \
- const int16_t *quant_ptr, const int16_t *quant_shift_ptr, \
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, \
- const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, \
- const int16_t *iscan
-
-#define QUAN_PARAM_LIST_NO_SKIP \
+#define QUAN_PARAM_LIST \
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \
const int16_t *round_ptr, const int16_t *quant_ptr, \
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \
@@ -43,44 +35,34 @@ using libaom_test::ACMRandom;
const int16_t *scan, const int16_t *iscan
typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
-typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST_NO_SKIP, int log_scale);
-typedef void (*QuantizeFuncNoSkip)(QUAN_PARAM_LIST_NO_SKIP);
+typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
#define HBD_QUAN_FUNC \
fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale)
-#define LBD_QUAN_FUNC_NO_SKIP \
+#define LBD_QUAN_FUNC \
fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan)
template <QuantizeFuncHbd fn>
void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) {
- (void)skip_block;
const int log_scale = 0;
HBD_QUAN_FUNC;
}
template <QuantizeFuncHbd fn>
void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) {
- (void)skip_block;
const int log_scale = 1;
HBD_QUAN_FUNC;
}
template <QuantizeFuncHbd fn>
void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
- (void)skip_block;
const int log_scale = 2;
HBD_QUAN_FUNC;
}
-template <QuantizeFuncNoSkip fn>
-void lowbd_quan_wrapper(QUAN_PARAM_LIST) {
- (void)skip_block;
- LBD_QUAN_FUNC_NO_SKIP;
-}
-
typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
using ::testing::tuple;
@@ -125,7 +107,6 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
tran_low_t *coeff_ptr = coeff_;
const intptr_t n_coeffs = coeff_num();
- const int skip_block = 0;
tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
@@ -158,13 +139,13 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
- quant_ref_(coeff_ptr, n_coeffs, skip_block, zbin, round, quant,
- quant_shift, qcoeff_ref, dqcoeff_ref, dequant, &eob[0],
- sc->scan, sc->iscan);
+ quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
+ qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan,
+ sc->iscan);
- ASM_REGISTER_STATE_CHECK(quant_(
- coeff_ptr, n_coeffs, skip_block, zbin, round, quant, quant_shift,
- qcoeff, dqcoeff, dequant, &eob[1], sc->scan, sc->iscan));
+ ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant,
+ quant_shift, qcoeff, dqcoeff, dequant,
+ &eob[1], sc->scan, sc->iscan));
for (int j = 0; j < n_coeffs; ++j) {
ASSERT_EQ(qcoeff_ref[j], qcoeff[j])
@@ -286,7 +267,6 @@ TEST_P(QuantizeTest, CoeffHalfDequant) {
TEST_P(QuantizeTest, DISABLED_Speed) {
tran_low_t *coeff_ptr = coeff_;
const intptr_t n_coeffs = coeff_num();
- const int skip_block = 0;
tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
@@ -312,8 +292,8 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
aom_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
- quant_(coeff_ptr, n_coeffs, skip_block, zbin, round_fp, quant_fp,
- quant_shift, qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+ dqcoeff, dequant, eob, sc->scan, sc->iscan);
}
aom_usec_timer_mark(&timer);
@@ -325,33 +305,24 @@ using ::testing::make_tuple;
#if HAVE_AVX2
const QuantizeParam kQParamArrayAvx2[] = {
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_16X16, TYPE_FP,
- AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_4X16, TYPE_FP,
- AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_16X4, TYPE_FP,
- AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_32X8, TYPE_FP,
- AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_avx2>, TX_8X32, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X16, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_32X32, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_4X16, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_16X64, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X4, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_32x32_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_32x32_avx2>, TX_64X16, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_32X8, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_64x64_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_64x64_avx2>, TX_64X64, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_8X32, TYPE_FP,
AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_16X64,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_64X16,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
+ TYPE_FP, AOM_BITS_8),
make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
TYPE_FP, AOM_BITS_8),
@@ -393,20 +364,17 @@ INSTANTIATE_TEST_CASE_P(AVX2, QuantizeTest,
#if HAVE_SSE2
const QuantizeParam kQParamArraySSE2[] = {
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_16X16, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X16, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_4X16, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_4X16, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X4, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_16X4, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_8X32, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_8X32, TYPE_FP,
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_32X8, TYPE_FP,
AOM_BITS_8),
- make_tuple(&lowbd_quan_wrapper<av1_quantize_fp_c>,
- &lowbd_quan_wrapper<av1_quantize_fp_sse2>, TX_32X8, TYPE_FP,
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2, TX_16X16, TYPE_B,
AOM_BITS_8),
make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
TYPE_B, AOM_BITS_8),
@@ -426,4 +394,32 @@ INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
::testing::ValuesIn(kQParamArraySSE2));
#endif
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, QuantizeTest,
+ ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
+ TX_16X16, TYPE_B, AOM_BITS_8)));
+
+// As in libvpx, the ssse3 and avx quantize tests do not pass.
+// https://bugs.chromium.org/p/webm/issues/detail?id=1448
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_SSSE3_32x32, QuantizeTest,
+ ::testing::Values(make_tuple(&aom_quantize_b_32x32_c,
+ &aom_quantize_b_32x32_ssse3, TX_16X16, TYPE_B,
+ AOM_BITS_8)));
+
+#endif // HAVE_SSSE3 && ARCH_X86_64
+
+#if HAVE_AVX && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ AVX, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, TX_16X16, TYPE_B,
+ AOM_BITS_8),
+ // Although these tests will not pass against _c, test them against each
+ // other so there is at least some cross-checking.
+ make_tuple(&aom_quantize_b_32x32_ssse3, &aom_quantize_b_32x32_avx,
+ TX_32X32, TYPE_B, AOM_BITS_8)));
+
+#endif // HAVE_AVX && ARCH_X86_64
} // namespace
diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc
index 9b849404c..a8536e517 100644
--- a/third_party/aom/test/reconinter_test.cc
+++ b/third_party/aom/test/reconinter_test.cc
@@ -83,7 +83,8 @@ class BuildCompDiffwtdMaskD16Test
protected:
void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
- void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl);
+ void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
+ DIFFWTD_MASK_TYPE mask_type);
libaom_test::ACMRandom rnd_;
}; // class BuildCompDiffwtdMaskD16Test
@@ -98,8 +99,7 @@ void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
- ConvolveParams conv_params =
- get_conv_params_no_round(0, 0, 0, NULL, 0, 1, bd);
+ ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);
int in_precision =
bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
@@ -130,7 +130,7 @@ void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
}
void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
- buildcompdiffwtdmaskd16_func test_impl) {
+ buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
const int width = block_size_wide[block_idx];
@@ -139,8 +139,7 @@ void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
- ConvolveParams conv_params =
- get_conv_params_no_round(0, 0, 0, NULL, 0, 1, bd);
+ ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);
int in_precision =
bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
@@ -150,31 +149,29 @@ void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
}
- const int num_loops = 1000000000 / (width + height);
+ const int num_loops = 10000000 / (width + height);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int i = 0; i < num_loops; ++i)
- av1_build_compound_diffwtd_mask_d16_c(mask, DIFFWTD_38, src0, width, src1,
+ av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
width, height, width, &conv_params,
bd);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("av1_build_compound_diffwtd_mask_d16 c_code %3dx%-3d: %7.2f us\n",
- width, height, 1000.0 * elapsed_time / num_loops);
aom_usec_timer timer1;
aom_usec_timer_start(&timer1);
for (int i = 0; i < num_loops; ++i)
- test_impl(mask, DIFFWTD_38, src0, width, src1, width, height, width,
+ test_impl(mask, mask_type, src0, width, src1, width, height, width,
&conv_params, bd);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("av1_build_compound_diffwtd_mask_d16 test_code %3dx%-3d: %7.2f us\n",
- width, height, 1000.0 * elapsed_time1 / num_loops);
+ printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", width,
+ height, elapsed_time / double(elapsed_time1));
}
#if HAVE_SSE4_1
void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
@@ -232,7 +229,8 @@ TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
}
TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
- RunSpeedTest(GET_PARAM(1));
+ RunSpeedTest(GET_PARAM(1), DIFFWTD_38);
+ RunSpeedTest(GET_PARAM(1), DIFFWTD_38_INV);
}
#if HAVE_SSE4_1
@@ -244,6 +242,14 @@ INSTANTIATE_TEST_CASE_P(
BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskD16Test,
+ BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
+#endif
+
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
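The reworked RunSpeedTest above now takes the mask type as an argument and reports a single C-to-SIMD ratio instead of two absolute timings; the same aom_usec_timer pattern recurs in the sum_squares, variance, and wiener speed tests below. A minimal sketch of that harness, assuming libaom's aom_ports/aom_timer.h API and using hypothetical ref_fn/tst_fn stand-ins for the pair under test:

```cpp
#include <cstdio>

#include "aom_ports/aom_timer.h"

// Hypothetical signature standing in for the function pair under test.
typedef void (*block_fn)(const unsigned char *src, int stride, int w, int h);

static double TimeUs(block_fn fn, const unsigned char *src, int stride, int w,
                     int h, int num_loops) {
  aom_usec_timer timer;
  aom_usec_timer_start(&timer);
  for (int i = 0; i < num_loops; ++i) fn(src, stride, w, h);
  aom_usec_timer_mark(&timer);
  return static_cast<double>(aom_usec_timer_elapsed(&timer));  // microseconds
}

static void Report(block_fn ref_fn, block_fn tst_fn, const unsigned char *src,
                   int stride, int w, int h) {
  const int num_loops = 10000000 / (w + h);  // scale loop count to block size
  const double t_ref = TimeUs(ref_fn, src, stride, w, h, num_loops);
  const double t_tst = TimeUs(tst_fn, src, stride, w, h, num_loops);
  // Per-call time in ns (1000 * us / loops) plus the C/SIMD speedup ratio,
  // the single figure the reworked test above now prints.
  printf("%3dx%-3d: %7.2f ns vs %7.2f ns (x%.2f)\n", w, h,
         1000.0 * t_ref / num_loops, 1000.0 * t_tst / num_loops,
         t_ref / t_tst);
}
```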
diff --git a/third_party/aom/test/register_state_check.h b/third_party/aom/test/register_state_check.h
index ef1f775e5..d404621dd 100644
--- a/third_party/aom/test/register_state_check.h
+++ b/third_party/aom/test/register_state_check.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_REGISTER_STATE_CHECK_H_
-#define TEST_REGISTER_STATE_CHECK_H_
+#ifndef AOM_TEST_REGISTER_STATE_CHECK_H_
+#define AOM_TEST_REGISTER_STATE_CHECK_H_
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -145,4 +145,4 @@ class RegisterStateCheckMMX {
#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
#endif
-#endif // TEST_REGISTER_STATE_CHECK_H_
+#endif // AOM_TEST_REGISTER_STATE_CHECK_H_
diff --git a/third_party/aom/test/sum_squares_test.cc b/third_party/aom/test/sum_squares_test.cc
index c03ebad4a..f10998498 100644
--- a/third_party/aom/test/sum_squares_test.cc
+++ b/third_party/aom/test/sum_squares_test.cc
@@ -52,6 +52,7 @@ class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
aom_free(src_);
}
void RunTest(int isRandom);
+ void RunSpeedTest();
void GenRandomData(int width, int height, int stride) {
const int msb = 11; // Up to 12 bit input
@@ -108,6 +109,38 @@ void SumSquaresTest::RunTest(int isRandom) {
}
}
+void SumSquaresTest::RunSpeedTest() {
+ for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+ const int width = block_size_wide[block]; // Up to 128x128
+ const int height = block_size_high[block]; // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(src_, stride, width, height);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(src_, stride, width, height);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+}
+
TEST_P(SumSquaresTest, OperationCheck) {
RunTest(1); // GenRandomData
}
@@ -116,6 +149,8 @@ TEST_P(SumSquaresTest, ExtremeValues) {
RunTest(0); // GenExtremeData
}
+TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
+
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
@@ -125,6 +160,13 @@ INSTANTIATE_TEST_CASE_P(
#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_avx2)));
+#endif // HAVE_AVX2
+
//////////////////////////////////////////////////////////////////////////////
// 1D version
//////////////////////////////////////////////////////////////////////////////
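A note on units in these speed tests: aom_usec_timer_elapsed() returns the elapsed time of the whole loop in microseconds, so the per-call figure RunSpeedTest prints as ns is

```latex
t_{\text{ns/call}} = \frac{1000 \cdot T_{\mu\text{s}}}{N_{\text{loops}}},
\qquad N_{\text{loops}} = \frac{10^{9}}{w + h}.
```

For 128x128 blocks that is roughly 3.9 million iterations per implementation, which is why the test ships DISABLED_ by default.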
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
index 723e06104..b6ee34701 100644
--- a/third_party/aom/test/test-data.sha1
+++ b/third_party/aom/test/test-data.sha1
@@ -2,6 +2,18 @@ d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res
+fa06784f23751d8c37be94160fb821e855199af4 *invalid-oss-fuzz-10061.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10061.ivf.res
+c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf
+88e18e61bd2b7457b4c71ebefbdff0029c41cc04 *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res
+91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
+c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res
+f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
+a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9720.ivf.res
a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
@@ -486,4 +498,10 @@ bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf
8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf
63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5
a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf
-8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
\ No newline at end of file
+8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
+e69e41fee40b408b6eebcc79f266a95f2ee24f9e *av1-1-b8-03-sizedown.mkv
+8c528fb3ccda959a29721566e132f730935ca32b *av1-1-b8-03-sizedown.mkv.md5
+1889da5ee1708007e47bb887470ac477e1d7ba01 *av1-1-b8-03-sizeup.mkv
+8de81b170635d456602dc8923a8b39c534d01fa8 *av1-1-b8-03-sizeup.mkv.md5
+d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m
+5b3f0907407b809aa66b62cb080feda8c92454ca *park_joy_90p_8_420_vertical_csp.y4m
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
index 7b584880f..b16ae14c3 100644
--- a/third_party/aom/test/test.cmake
+++ b/third_party/aom/test/test.cmake
@@ -20,17 +20,20 @@ include("${AOM_ROOT}/test/test_data_util.cmake")
set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
-list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_CONFIG_DIR}/usage_exit.c"
+list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
"${AOM_ROOT}/test/test_libaom.cc")
list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
"${AOM_ROOT}/test/acm_random.h"
"${AOM_ROOT}/test/aom_integer_test.cc"
+ "${AOM_ROOT}/test/av1_config_test.cc"
+ "${AOM_ROOT}/test/blockd_test.cc"
"${AOM_ROOT}/test/clear_system_state.h"
"${AOM_ROOT}/test/codec_factory.h"
"${AOM_ROOT}/test/decode_test_driver.cc"
"${AOM_ROOT}/test/decode_test_driver.h"
"${AOM_ROOT}/test/function_equivalence_test.h"
+ "${AOM_ROOT}/test/log2_test.cc"
"${AOM_ROOT}/test/md5_helper.h"
"${AOM_ROOT}/test/register_state_check.h"
"${AOM_ROOT}/test/test_vectors.cc"
@@ -45,6 +48,7 @@ if(CONFIG_INTERNAL_STATS)
endif()
list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
+ "${AOM_ROOT}/test/external_frame_buffer_test.cc"
"${AOM_ROOT}/test/invalid_file_test.cc"
"${AOM_ROOT}/test/test_vector_test.cc"
"${AOM_ROOT}/test/ivf_video_source.h")
@@ -53,8 +57,6 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/active_map_test.cc"
"${AOM_ROOT}/test/altref_test.cc"
"${AOM_ROOT}/test/aq_segment_test.cc"
- "${AOM_ROOT}/test/av1_txfm_test.cc"
- "${AOM_ROOT}/test/av1_txfm_test.h"
"${AOM_ROOT}/test/borders_test.cc"
"${AOM_ROOT}/test/cpu_speed_test.cc"
"${AOM_ROOT}/test/datarate_test.cc"
@@ -78,7 +80,7 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
-list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_CONFIG_DIR}/usage_exit.c"
+list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
"${AOM_ROOT}/test/test_intra_pred_speed.cc")
if(NOT BUILD_SHARED_LIBS)
@@ -105,6 +107,7 @@ if(NOT BUILD_SHARED_LIBS)
if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
"${AOM_ROOT}/test/av1_ext_tile_test.cc"
"${AOM_ROOT}/test/binary_codes_test.cc"
"${AOM_ROOT}/test/boolcoder_test.cc"
@@ -168,6 +171,8 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
"${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
"${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.h"
"${AOM_ROOT}/test/av1_wedge_utils_test.cc"
"${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
"${AOM_ROOT}/test/blend_a64_mask_test.cc"
@@ -184,11 +189,16 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/noise_model_test.cc"
"${AOM_ROOT}/test/obmc_sad_test.cc"
"${AOM_ROOT}/test/obmc_variance_test.cc"
+ "${AOM_ROOT}/test/pickrst_test.cc"
"${AOM_ROOT}/test/sad_test.cc"
"${AOM_ROOT}/test/subtract_test.cc"
"${AOM_ROOT}/test/reconinter_test.cc"
"${AOM_ROOT}/test/sum_squares_test.cc"
- "${AOM_ROOT}/test/variance_test.cc")
+ "${AOM_ROOT}/test/variance_test.cc"
+ "${AOM_ROOT}/test/wiener_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.h")
list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/test/av1_highbd_iht_test.cc"
@@ -201,10 +211,8 @@ if(NOT BUILD_SHARED_LIBS)
list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/av1_convolve_scale_test.cc"
"${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
- "${AOM_ROOT}/test/intra_edge_test.cc"
- "${AOM_ROOT}/test/warp_filter_test.cc"
- "${AOM_ROOT}/test/warp_filter_test_util.cc"
- "${AOM_ROOT}/test/warp_filter_test_util.h")
+ "${AOM_ROOT}/test/intra_edge_test.cc")
+
endif()
if(HAVE_SSE4_2)
@@ -224,6 +232,9 @@ if(ENABLE_TESTS)
if(MSVC) # Force static run time to avoid collisions with googletest.
include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
+ if(BUILD_SHARED_LIBS)
+ set(AOM_DISABLE_GTEST_CMAKE 1)
+ endif()
endif()
if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
index 9fe00a07d..45c951478 100644
--- a/third_party/aom/test/test_data_util.cmake
+++ b/third_party/aom/test/test_data_util.cmake
@@ -14,6 +14,18 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
"hantro_odd.yuv"
"invalid-bug-1814.ivf"
"invalid-bug-1814.ivf.res"
+ "invalid-oss-fuzz-10061.ivf"
+ "invalid-oss-fuzz-10061.ivf.res"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res"
+ "invalid-oss-fuzz-10227.ivf"
+ "invalid-oss-fuzz-10227.ivf.res"
+ "invalid-oss-fuzz-9463.ivf"
+ "invalid-oss-fuzz-9463.ivf.res"
+ "invalid-oss-fuzz-9482.ivf"
+ "invalid-oss-fuzz-9482.ivf.res"
+ "invalid-oss-fuzz-9720.ivf"
+ "invalid-oss-fuzz-9720.ivf.res"
"park_joy_90p_10_420.y4m"
"park_joy_90p_10_422.y4m"
"park_joy_90p_10_444.y4m"
@@ -22,6 +34,8 @@ list(APPEND AOM_TEST_DATA_FILE_NAMES
"park_joy_90p_12_444.y4m"
"park_joy_90p_8_420_a10-1.y4m"
"park_joy_90p_8_420.y4m"
+ "park_joy_90p_8_420_monochrome.y4m"
+ "park_joy_90p_8_420_vertical_csp.y4m"
"park_joy_90p_8_422.y4m"
"park_joy_90p_8_444.y4m"
"desktop_credits.y4m"
@@ -494,7 +508,11 @@ if(CONFIG_AV1_DECODER)
"av1-1-b8-01-size-66x66.ivf"
"av1-1-b8-01-size-66x66.ivf.md5"
"av1-1-b8-02-allintra.ivf"
- "av1-1-b8-02-allintra.ivf.md5")
+ "av1-1-b8-02-allintra.ivf.md5"
+ "av1-1-b8-03-sizeup.mkv"
+ "av1-1-b8-03-sizeup.mkv.md5"
+ "av1-1-b8-03-sizedown.mkv"
+ "av1-1-b8-03-sizedown.mkv.md5")
endif()
if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc
index 1bfeacba1..286988b17 100644
--- a/third_party/aom/test/test_vector_test.cc
+++ b/third_party/aom/test/test_vector_test.cc
@@ -119,7 +119,8 @@ TEST_P(TestVectorTest, MD5Match) {
testing::internal::scoped_ptr<libaom_test::CompressedVideoSource> video;
if (filename.substr(filename.length() - 3, 3) == "ivf") {
video.reset(new libaom_test::IVFVideoSource(filename));
- } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+ } else if (filename.substr(filename.length() - 4, 4) == "webm" ||
+ filename.substr(filename.length() - 3, 3) == "mkv") {
#if CONFIG_WEBM_IO
video.reset(new libaom_test::WebMVideoSource(filename));
#else
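The dispatch above keys on the file suffix, with .mkv now routed to the WebM reader alongside .webm. A small helper (hypothetical, not part of the patch) expresses the same check without the repeated substr arithmetic:

```cpp
#include <string>

// Hypothetical helper: true when filename ends with suffix.
static bool EndsWith(const std::string &filename, const std::string &suffix) {
  return filename.size() >= suffix.size() &&
         filename.compare(filename.size() - suffix.size(), suffix.size(),
                          suffix) == 0;
}

// Usage mirroring the dispatch in TestVectorTest::MD5Match:
//   if (EndsWith(filename, ".ivf"))        -> IVFVideoSource
//   else if (EndsWith(filename, ".webm") ||
//            EndsWith(filename, ".mkv"))   -> WebMVideoSource (CONFIG_WEBM_IO)
```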
diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc
index f478c0183..71e431e18 100644
--- a/third_party/aom/test/test_vectors.cc
+++ b/third_party/aom/test/test_vectors.cc
@@ -131,7 +131,8 @@ const char *const kAV1TestVectors[] = {
"av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf",
"av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf",
"av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf",
- "av1-1-b8-02-allintra.ivf",
+ "av1-1-b8-02-allintra.ivf", "av1-1-b8-03-sizedown.mkv",
+ "av1-1-b8-03-sizeup.mkv"
};
const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
#endif // CONFIG_AV1_DECODER
diff --git a/third_party/aom/test/test_vectors.h b/third_party/aom/test/test_vectors.h
index 229f063a6..be37f6e37 100644
--- a/third_party/aom/test/test_vectors.h
+++ b/third_party/aom/test/test_vectors.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_TEST_VECTORS_H_
-#define TEST_TEST_VECTORS_H_
+#ifndef AOM_TEST_TEST_VECTORS_H_
+#define AOM_TEST_TEST_VECTORS_H_
#include "config/aom_config.h"
@@ -23,4 +23,4 @@ extern const char *const kAV1TestVectors[];
} // namespace libaom_test
-#endif // TEST_TEST_VECTORS_H_
+#endif // AOM_TEST_TEST_VECTORS_H_
diff --git a/third_party/aom/test/transform_test_base.h b/third_party/aom/test/transform_test_base.h
index 67e8faf33..8ebcf5ff7 100644
--- a/third_party/aom/test/transform_test_base.h
+++ b/third_party/aom/test/transform_test_base.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_TRANSFORM_TEST_BASE_H_
-#define TEST_TRANSFORM_TEST_BASE_H_
+#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_
+#define AOM_TEST_TRANSFORM_TEST_BASE_H_
#include "config/aom_config.h"
@@ -339,4 +339,4 @@ class TransformTestBase {
} // namespace libaom_test
-#endif // TEST_TRANSFORM_TEST_BASE_H_
+#endif // AOM_TEST_TRANSFORM_TEST_BASE_H_
diff --git a/third_party/aom/test/util.h b/third_party/aom/test/util.h
index db00875ef..c3f4e4442 100644
--- a/third_party/aom/test/util.h
+++ b/third_party/aom/test/util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_UTIL_H_
-#define TEST_UTIL_H_
+#ifndef AOM_TEST_UTIL_H_
+#define AOM_TEST_UTIL_H_
#include <stdio.h>
#include <math.h>
@@ -50,4 +50,4 @@ static INLINE double get_time_mark(aom_usec_timer *t) {
return static_cast<double>(aom_usec_timer_elapsed(t));
}
-#endif // TEST_UTIL_H_
+#endif // AOM_TEST_UTIL_H_
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
index 2f1b1fc5a..0df314b0f 100644
--- a/third_party/aom/test/variance_test.cc
+++ b/third_party/aom/test/variance_test.cc
@@ -470,6 +470,7 @@ class MainTestClass
int byte_shift() const { return params_.bit_depth - 8; }
int block_size() const { return params_.block_size; }
int width() const { return params_.width; }
+ int height() const { return params_.height; }
uint32_t mask() const { return params_.mask; }
};
@@ -583,18 +584,19 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
}
}
- unsigned int sse1, sse2, var1, var2;
+ unsigned int sse;
const int stride = width();
int run_time = 1000000000 / block_size();
-
- ASM_REGISTER_STATE_CHECK(var1 =
- params_.func(src_, stride, ref_, stride, &sse1));
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
for (int i = 0; i < run_time; ++i) {
- ASM_REGISTER_STATE_CHECK(
- var2 = params_.func(src_, stride, ref_, stride, &sse2));
+ params_.func(src_, stride, ref_, stride, &sse);
}
- EXPECT_EQ(var1, var2);
- EXPECT_EQ(sse1, sse2);
+
+ aom_usec_timer_mark(&timer);
+ const double elapsed_time =
+ static_cast<double>(aom_usec_timer_elapsed(&timer));
+ printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time);
}
////////////////////////////////////////////////////////////////////////////////
@@ -1183,6 +1185,7 @@ TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -1607,39 +1610,71 @@ INSTANTIATE_TEST_CASE_P(
MseParams(3, 3, &aom_highbd_8_mse8x8_sse2)));
*/
-INSTANTIATE_TEST_CASE_P(
- SSE2, AvxHBDVarianceTest,
- ::testing::Values(
- VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
- VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
- VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
- VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
- VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
- VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
- VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
- VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
- VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
- VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
- VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
- VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
- VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
- VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
- VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
- VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
- VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
- VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
- VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
- VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
- VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
- VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
- VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
- VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
- VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
- VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
- VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
- VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
- VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
- VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8)));
+const VarianceParams kArrayHBDVariance_sse2[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_sse2, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_sse2, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_sse2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_sse2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_sse2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_sse2, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_sse2, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_sse2, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8)
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_sse2));
+
+#if HAVE_AVX2
+
+const VarianceParams kArrayHBDVariance_avx2[] = {
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_avx2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_avx2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_avx2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_avx2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_avx2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_avx2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_avx2));
+#endif // HAVE_AVX2
const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
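The instantiation refactor above, moving from one giant inline ::testing::Values(...) call to named arrays fed through ::testing::ValuesIn, keeps each per-ISA table independently editable and avoids the fixed argument ceiling that the pre-variadic googletest Values() generator imposes. The shape of the refactor, with hypothetical names:

```cpp
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

struct ExampleParams {  // hypothetical stand-in for VarianceParams
  int log2w, log2h, bit_depth;
};

class ExampleTest : public ::testing::TestWithParam<ExampleParams> {};
TEST_P(ExampleTest, Runs) { SUCCEED(); }

// One flat, greppable table per ISA; adding a case is a one-line change.
static const ExampleParams kParamsSse2[] = {
  { 6, 6, 12 }, { 6, 5, 12 }, { 5, 6, 10 },
};

INSTANTIATE_TEST_CASE_P(SSE2, ExampleTest, ::testing::ValuesIn(kParamsSse2));
```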
diff --git a/third_party/aom/test/video_source.h b/third_party/aom/test/video_source.h
index dc39b5a80..3c1c5e559 100644
--- a/third_party/aom/test/video_source.h
+++ b/third_party/aom/test/video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_VIDEO_SOURCE_H_
-#define TEST_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_VIDEO_SOURCE_H_
+#define AOM_TEST_VIDEO_SOURCE_H_
#if defined(_WIN32)
#undef NOMINMAX
@@ -256,4 +256,4 @@ class CompressedVideoSource {
} // namespace libaom_test
-#endif // TEST_VIDEO_SOURCE_H_
+#endif // AOM_TEST_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc
index 15f8a285c..19a4e8b6a 100644
--- a/third_party/aom/test/warp_filter_test.cc
+++ b/third_party/aom/test/warp_filter_test.cc
@@ -17,20 +17,40 @@ using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
namespace {
-#if HAVE_SSE4_1
-TEST_P(AV1WarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
-TEST_P(AV1WarpFilterTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(3)); }
+TEST_P(AV1WarpFilterTest, CheckOutput) {
+ RunCheckOutput(::testing::get<3>(GET_PARAM(0)));
+}
+TEST_P(AV1WarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(::testing::get<3>(GET_PARAM(0)));
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
+
+#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
SSE4_1, AV1WarpFilterTest,
libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
-TEST_P(AV1HighbdWarpFilterTest, CheckOutput) { RunCheckOutput(GET_PARAM(4)); }
-TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(4)); }
+TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
+ RunCheckOutput(::testing::get<4>(GET_PARAM(0)));
+}
+TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(::testing::get<4>(GET_PARAM(0)));
+}
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdWarpFilterTest,
libaom_test::AV1HighbdWarpFilter::BuildParams(
av1_highbd_warp_affine_sse4_1));
#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
+#endif // HAVE_NEON
+
} // namespace
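The warp tests now build their parameter space with ::testing::Combine, so each instance is a tuple whose first element is the old WarpTestParam tuple and whose remaining four elements are the is_alpha/beta/gamma/delta_zero flags; that is why the accessors change from GET_PARAM(3) to ::testing::get<3>(GET_PARAM(0)). A stripped-down sketch of the nesting, with hypothetical names:

```cpp
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

typedef ::testing::tuple<int, int, int> Inner;          // e.g. (w, h, iters)
typedef ::testing::tuple<Inner, int, int> OuterParams;  // inner tuple + flags

class NestedParamTest : public ::testing::TestWithParam<OuterParams> {};

TEST_P(NestedParamTest, Access) {
  const Inner inner = ::testing::get<0>(GetParam());
  const int w = ::testing::get<0>(inner);          // field of the nested tuple
  const int flag = ::testing::get<1>(GetParam());  // first combined flag
  EXPECT_GE(w, 0);
  EXPECT_TRUE(flag == 0 || flag == 1);
}

static const Inner kInner[] = { Inner(4, 4, 100), Inner(8, 8, 100) };

// Cross product: 2 inner params x 2 x 2 flag values = 8 test instances.
INSTANTIATE_TEST_CASE_P(Example, NestedParamTest,
                        ::testing::Combine(::testing::ValuesIn(kInner),
                                           ::testing::Values(0, 1),
                                           ::testing::Values(0, 1)));
```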
diff --git a/third_party/aom/test/warp_filter_test_util.cc b/third_party/aom/test/warp_filter_test_util.cc
index b341cd0c2..69b2ed4af 100644
--- a/third_party/aom/test/warp_filter_test_util.cc
+++ b/third_party/aom/test/warp_filter_test_util.cc
@@ -28,22 +28,35 @@ int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
int16_t *alpha, int16_t *beta, int16_t *gamma,
- int16_t *delta) {
+ int16_t *delta, const int is_alpha_zero,
+ const int is_beta_zero, const int is_gamma_zero,
+ const int is_delta_zero) {
while (1) {
+ int rnd8 = rnd->Rand8() & 3;
mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
(1 << WARPEDMODEL_PREC_BITS);
mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
- // 50/50 chance of generating ROTZOOM vs. AFFINE models
- if (rnd->Rand8() & 1) {
+
+ if (rnd8 <= 1) {
// AFFINE
mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
(1 << WARPEDMODEL_PREC_BITS);
- } else {
+ } else if (rnd8 == 2) {
mat[4] = -mat[3];
mat[5] = mat[2];
+ } else {
+ mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+ mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ if (is_beta_zero == 1) mat[3] = 0;
+ if (is_gamma_zero == 1) mat[4] = 0;
+ if (is_delta_zero == 1)
+ mat[5] = (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) +
+ (1 << WARPEDMODEL_PREC_BITS);
}
// Calculate the derived parameters and check that they are suitable
@@ -78,14 +91,16 @@ void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
}
namespace AV1WarpFilter {
-::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
warp_affine_func filter) {
- const WarpTestParam params[] = {
+ WarpTestParam params[] = {
make_tuple(4, 4, 50000, filter), make_tuple(8, 8, 50000, filter),
make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
make_tuple(32, 8, 10000, filter),
};
- return ::testing::ValuesIn(params);
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
}
AV1WarpFilterTest::~AV1WarpFilterTest() {}
@@ -97,7 +112,13 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
const int w = 128, h = 128;
const int border = 16;
const int stride = w + 2 * border;
- const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ WarpTestParam params = GET_PARAM(0);
+ const int out_w = ::testing::get<0>(params),
+ out_h = ::testing::get<1>(params);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
int sub_x, sub_y;
const int bd = 8;
@@ -110,10 +131,11 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
uint8_t *output = new uint8_t[output_n];
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
-
- generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
for (int r = 0; r < h; ++r)
for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
@@ -126,7 +148,7 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
sub_y = 0;
int do_average = 0;
- conv_params = get_conv_params_no_round(0, do_average, 0, dsta, out_w, 1, bd);
+ conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
conv_params.use_jnt_comp_avg = 0;
const int num_loops = 1000000000 / (out_w + out_h);
@@ -150,8 +172,14 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
const int w = 128, h = 128;
const int border = 16;
const int stride = w + 2 * border;
- const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
- const int num_iters = GET_PARAM(2);
+ WarpTestParam params = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(params),
+ out_h = ::testing::get<1>(params);
+ const int num_iters = ::testing::get<2>(params);
int i, j, sub_x, sub_y;
const int bd = 8;
@@ -164,7 +192,7 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
uint8_t *output2 = new uint8_t[output_n];
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
@@ -180,15 +208,18 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
const int use_no_round = rnd_.Rand8() & 1;
for (sub_x = 0; sub_x < 2; ++sub_x)
for (sub_y = 0; sub_y < 2; ++sub_y) {
- generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
+
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
for (int do_average = 0; do_average <= 1; ++do_average) {
if (use_no_round) {
- conv_params = get_conv_params_no_round(0, do_average, 0, dsta,
- out_w, 1, bd);
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
} else {
- conv_params = get_conv_params(0, 0, 0, bd);
+ conv_params = get_conv_params(0, 0, bd);
}
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
@@ -201,8 +232,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
out_h, out_w, sub_x, sub_y, &conv_params, alpha,
beta, gamma, delta);
if (use_no_round) {
- conv_params = get_conv_params_no_round(0, do_average, 0, dstb,
- out_w, 1, bd);
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
}
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
@@ -246,7 +277,7 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
} // namespace AV1WarpFilter
namespace AV1HighbdWarpFilter {
-::testing::internal::ParamGenerator<HighbdWarpTestParam> BuildParams(
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
highbd_warp_affine_func filter) {
const HighbdWarpTestParam params[] = {
make_tuple(4, 4, 100, 8, filter), make_tuple(8, 8, 100, 8, filter),
@@ -258,7 +289,9 @@ namespace AV1HighbdWarpFilter {
make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter),
make_tuple(32, 8, 100, 12, filter),
};
- return ::testing::ValuesIn(params);
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
}
AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
@@ -272,8 +305,13 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
const int w = 128, h = 128;
const int border = 16;
const int stride = w + 2 * border;
- const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
- const int bd = GET_PARAM(3);
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
+ const int bd = ::testing::get<3>(param);
const int mask = (1 << bd) - 1;
int sub_x, sub_y;
@@ -285,10 +323,12 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
uint16_t *output = new uint16_t[output_n];
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
- generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
// Generate an input block and extend its borders horizontally
for (int r = 0; r < h; ++r)
for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
@@ -303,7 +343,7 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
sub_y = 0;
int do_average = 0;
conv_params.use_jnt_comp_avg = 0;
- conv_params = get_conv_params_no_round(0, do_average, 0, dsta, out_w, 1, bd);
+ conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
const int num_loops = 1000000000 / (out_w + out_h);
aom_usec_timer timer;
@@ -328,9 +368,14 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
const int w = 128, h = 128;
const int border = 16;
const int stride = w + 2 * border;
- const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
- const int num_iters = GET_PARAM(2);
- const int bd = GET_PARAM(3);
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
+ const int bd = ::testing::get<3>(param);
+ const int num_iters = ::testing::get<2>(param);
const int mask = (1 << bd) - 1;
int i, j, sub_x, sub_y;
@@ -343,7 +388,7 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
uint16_t *output2 = new uint16_t[output_n];
int32_t mat[8];
int16_t alpha, beta, gamma, delta;
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
@@ -361,15 +406,17 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
const int use_no_round = rnd_.Rand8() & 1;
for (sub_x = 0; sub_x < 2; ++sub_x)
for (sub_y = 0; sub_y < 2; ++sub_y) {
- generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta);
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
for (int ii = 0; ii < 2; ++ii) {
for (int jj = 0; jj < 5; ++jj) {
for (int do_average = 0; do_average <= 1; ++do_average) {
if (use_no_round) {
- conv_params = get_conv_params_no_round(0, do_average, 0, dsta,
- out_w, 1, bd);
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
} else {
- conv_params = get_conv_params(0, 0, 0, bd);
+ conv_params = get_conv_params(0, 0, bd);
}
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
@@ -385,8 +432,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
if (use_no_round) {
// TODO(angiebird): Change this to test_impl once we have SIMD
// implementation
- conv_params = get_conv_params_no_round(0, do_average, 0, dstb,
- out_w, 1, bd);
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
}
if (jj >= 4) {
conv_params.use_jnt_comp_avg = 0;
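The extended generate_warped_model above can now force each shear parameter to zero on demand. Assuming the derived-parameter definitions in av1/common/warped_motion.c (av1_get_shear_params), which, ignoring clamping and rounding, are roughly

```latex
\alpha = m_2 - 2^{P}, \qquad
\beta  = m_3, \qquad
\gamma = \frac{m_4 \cdot 2^{P}}{m_2}, \qquad
\delta = m_5 - \frac{m_3\, m_4}{m_2} - 2^{P},
\qquad P = \mathtt{WARPEDMODEL\_PREC\_BITS},
```

zeroing alpha, beta, and gamma only needs the direct mat[2..4] overrides in the new branch, while delta requires the computed assignment mat[5] = round(mat[3]*mat[4]/mat[2]) + (1 << P), which is exactly the is_delta_zero expression above.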
diff --git a/third_party/aom/test/warp_filter_test_util.h b/third_party/aom/test/warp_filter_test_util.h
index cf72d9db6..b8998e5c8 100644
--- a/third_party/aom/test/warp_filter_test_util.h
+++ b/third_party/aom/test/warp_filter_test_util.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_WARP_FILTER_TEST_UTIL_H_
-#define TEST_WARP_FILTER_TEST_UTIL_H_
+#ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+#define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
@@ -28,7 +28,8 @@ namespace libaom_test {
void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
int16_t *alpha, int16_t *beta, int16_t *gamma,
- int16_t *delta);
+ int16_t *delta, int is_alpha_zero, int is_beta_zero,
+ int is_gamma_zero, int is_delta_zero);
namespace AV1WarpFilter {
@@ -41,11 +42,12 @@ typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
int16_t beta, int16_t gamma, int16_t delta);
typedef ::testing::tuple<int, int, int, warp_affine_func> WarpTestParam;
+typedef ::testing::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
-::testing::internal::ParamGenerator<WarpTestParam> BuildParams(
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
warp_affine_func filter);
-class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParam> {
+class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
public:
virtual ~AV1WarpFilterTest();
virtual void SetUp();
@@ -73,12 +75,14 @@ typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
typedef ::testing::tuple<int, int, int, int, highbd_warp_affine_func>
HighbdWarpTestParam;
+typedef ::testing::tuple<HighbdWarpTestParam, int, int, int, int>
+ HighbdWarpTestParams;
-::testing::internal::ParamGenerator<HighbdWarpTestParam> BuildParams(
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
highbd_warp_affine_func filter);
class AV1HighbdWarpFilterTest
- : public ::testing::TestWithParam<HighbdWarpTestParam> {
+ : public ::testing::TestWithParam<HighbdWarpTestParams> {
public:
virtual ~AV1HighbdWarpFilterTest();
virtual void SetUp();
@@ -96,4 +100,4 @@ class AV1HighbdWarpFilterTest
} // namespace libaom_test
-#endif // TEST_WARP_FILTER_TEST_UTIL_H_
+#endif // AOM_TEST_WARP_FILTER_TEST_UTIL_H_
diff --git a/third_party/aom/test/webm_video_source.h b/third_party/aom/test/webm_video_source.h
index 482f5dea2..bb3d11735 100644
--- a/third_party/aom/test/webm_video_source.h
+++ b/third_party/aom/test/webm_video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_WEBM_VIDEO_SOURCE_H_
-#define TEST_WEBM_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#define AOM_TEST_WEBM_VIDEO_SOURCE_H_
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
@@ -93,4 +93,4 @@ class WebMVideoSource : public CompressedVideoSource {
} // namespace libaom_test
-#endif // TEST_WEBM_VIDEO_SOURCE_H_
+#endif // AOM_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/wiener_test.cc b/third_party/aom/test/wiener_test.cc
new file mode 100644
index 000000000..dfec09119
--- /dev/null
+++ b/third_party/aom/test/wiener_test.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ int i, j, k, l, m, n;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ std::vector<std::vector<int64_t> > M_int(wiener_win,
+ std::vector<int64_t>(wiener_win, 0));
+ std::vector<std::vector<int64_t> > H_int(
+ wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+ std::vector<std::vector<int32_t> > sumY(wiener_win,
+ std::vector<int32_t>(wiener_win, 0));
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[i * src_stride + j];
+ const uint8_t X2 = src[i * src_stride + j + 1];
+ sumX += X1 + X2;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ const uint8_t D2 = dgd_ijkl[1];
+ sumY[k][l] += D1 + D2;
+ M_int[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ M_int[l][k] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static const int kIterations = 100;
+static const double min_error = (double)(0.01);
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H);
+
+typedef libaom_test::FuncParam<compute_stats_Func> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef ::testing::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void runWienerTest(const int32_t wiener_win, int32_t run_times);
+ void runWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+ compute_stats_Func target_func_;
+ ACMRandom rng_;
+};
+
+void WienerTest::runWienerTest(const int32_t wiener_win, int32_t run_times) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int h_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int v_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = run_times == 1 ? kIterations : 2;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = rng_.Rand8();
+ src_buf[i] = rng_.Rand8();
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+ time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ // ASSERT_EQ(failed, 0);
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+void WienerTest::runWienerTest_ExtremeValues(const int32_t wiener_win) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = 16;
+ const int h_end = MAX_WIENER_BLOCK;
+ const int v_start = 16;
+ const int v_end = MAX_WIENER_BLOCK;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = 1;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = 255;
+ src_buf[i] = 255;
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ // ASSERT_EQ(failed, 0);
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+TEST_P(WienerTest, RandomValues) {
+ runWienerTest(WIENER_WIN, 1);
+ runWienerTest(WIENER_WIN_CHROMA, 1);
+}
+
+TEST_P(WienerTest, ExtremeValues) {
+ runWienerTest_ExtremeValues(WIENER_WIN);
+ runWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
+}
+
+TEST_P(WienerTest, DISABLED_Speed) {
+ runWienerTest(WIENER_WIN, 200);
+ runWienerTest(WIENER_WIN_CHROMA, 200);
+}
+
+INSTANTIATE_TEST_CASE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, WienerTest,
+ ::testing::Values(av1_compute_stats_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, WienerTest,
+ ::testing::Values(av1_compute_stats_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
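In compute_stats_win_opt_c, the closing correction (M = M_int + avg_square_sum - avg * (sumX + sumY), and the analogous step for H) is the expanded form of the mean-removed correlation that Wiener filter estimation needs:

```latex
\sum_i (D_i - \mu)(X_i - \mu)
  \;=\; \sum_i D_i X_i \;-\; \mu \sum_i X_i \;-\; \mu \sum_i D_i \;+\; N\mu^{2},
```

with mu the average returned by find_average and N = pixel_count. Accumulating the raw products in int64_t and applying the floating-point correction once at the end keeps the optimized path in integer arithmetic inside the hot loops, and explains the small min_error tolerance (0.01) used for comparison instead of exact equality.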
diff --git a/third_party/aom/test/y4m_test.cc b/third_party/aom/test/y4m_test.cc
index b8011935d..6cc75ef5b 100644
--- a/third_party/aom/test/y4m_test.cc
+++ b/third_party/aom/test/y4m_test.cc
@@ -37,6 +37,10 @@ struct Y4mTestParam {
const Y4mTestParam kY4mTestVectors[] = {
{ "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420,
"e5406275b9fc6bb3436c31d4a05c1cab" },
+ { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
+ "95ef5bf6218580588be24a5271bb6a7f" },
+ { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
+ "f53a40fec15254ac312527339d9c686b" },
{ "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
"284a47a47133b12884ec3a14e959a0b6" },
{ "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
@@ -55,24 +59,7 @@ const Y4mTestParam kY4mTestVectors[] = {
"5a6481a550821dab6d0192f5c63845e9" },
};
-static void write_image_file(const aom_image_t *img, FILE *file) {
- int plane, y;
- for (plane = 0; plane < 3; ++plane) {
- const unsigned char *buf = img->planes[plane];
- const int stride = img->stride[plane];
- const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
- const int h =
- (plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift
- : img->d_h);
- const int w =
- (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift
- : img->d_w);
- for (y = 0; y < h; ++y) {
- fwrite(buf, bytes_per_sample, w, file);
- buf += stride;
- }
- }
-}
+static const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
public ::libaom_test::Y4mVideoSource {
@@ -162,12 +149,13 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
tmpfile_ = new libaom_test::TempOutFile;
ASSERT_TRUE(tmpfile_->file() != NULL);
y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
- y4m_.aom_fmt, y4m_.bit_depth);
+ img()->monochrome, img()->csp, y4m_.aom_fmt,
+ y4m_.bit_depth);
fputs(buf, tmpfile_->file());
for (unsigned int i = start_; i < limit_; i++) {
y4m_write_frame_header(buf, sizeof(buf));
fputs(buf, tmpfile_->file());
- write_image_file(img(), tmpfile_->file());
+ y4m_write_image_file(img(), PLANES_YUV, tmpfile_->file());
Next();
}
ReplaceInputFile(tmpfile_->file());
diff --git a/third_party/aom/test/y4m_video_source.h b/third_party/aom/test/y4m_video_source.h
index 277ded9eb..3dea901e6 100644
--- a/third_party/aom/test/y4m_video_source.h
+++ b/third_party/aom/test/y4m_video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_Y4M_VIDEO_SOURCE_H_
-#define TEST_Y4M_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#define AOM_TEST_Y4M_VIDEO_SOURCE_H_
#include <algorithm>
#include <string>
@@ -120,4 +120,4 @@ class Y4mVideoSource : public VideoSource {
} // namespace libaom_test
-#endif // TEST_Y4M_VIDEO_SOURCE_H_
+#endif // AOM_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/yuv_video_source.h b/third_party/aom/test/yuv_video_source.h
index 51554af6f..774ecc008 100644
--- a/third_party/aom/test/yuv_video_source.h
+++ b/third_party/aom/test/yuv_video_source.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TEST_YUV_VIDEO_SOURCE_H_
-#define TEST_YUV_VIDEO_SOURCE_H_
+#ifndef AOM_TEST_YUV_VIDEO_SOURCE_H_
+#define AOM_TEST_YUV_VIDEO_SOURCE_H_
#include <cstdio>
#include <cstdlib>
@@ -120,4 +120,4 @@ class YUVVideoSource : public VideoSource {
} // namespace libaom_test
-#endif // TEST_YUV_VIDEO_SOURCE_H_
+#endif // AOM_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/third_party/aom/third_party/vector/LICENSE b/third_party/aom/third_party/vector/LICENSE
new file mode 100644
index 000000000..afcb9f00a
--- /dev/null
+++ b/third_party/aom/third_party/vector/LICENSE
@@ -0,0 +1,19 @@
+The MIT License (MIT)
+Copyright (c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/third_party/aom/third_party/vector/README.libaom b/third_party/aom/third_party/vector/README.libaom
new file mode 100644
index 000000000..2bb8b2d5d
--- /dev/null
+++ b/third_party/aom/third_party/vector/README.libaom
@@ -0,0 +1,14 @@
+Name: vector
+URL: https://github.com/goldsborough/vector
+Version: commit-id: 40efe82
+License: MIT
+License File: LICENSE
+
+Description:
+A feature-complete, generic and customizable resizable
+array implementation in pure C that supports almost
+the entire C++ std::vector API, including iterators.
+
+Local Modifications:
+Renamed some functions to fit in with the AOMedia
+naming convention.
diff --git a/third_party/aom/tools/obu_parser.cc b/third_party/aom/tools/obu_parser.cc
index 2d0f5b27c..7d71386ce 100644
--- a/third_party/aom/tools/obu_parser.cc
+++ b/third_party/aom/tools/obu_parser.cc
@@ -8,6 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <string.h>
#include <cstdio>
#include <string>
@@ -15,7 +16,7 @@
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem_ops.h"
-#include "av1/decoder/obu.h"
+#include "av1/common/obu_util.h"
#include "tools/obu_parser.h"
namespace aom_tools {
diff --git a/third_party/aom/tools/obu_parser.h b/third_party/aom/tools/obu_parser.h
index 86e7c4581..1d7d2d794 100644
--- a/third_party/aom/tools/obu_parser.h
+++ b/third_party/aom/tools/obu_parser.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TOOLS_OBU_PARSER_H_
-#define TOOLS_OBU_PARSER_H_
+#ifndef AOM_TOOLS_OBU_PARSER_H_
+#define AOM_TOOLS_OBU_PARSER_H_
#include <cstdint>
@@ -24,4 +24,4 @@ bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
} // namespace aom_tools
-#endif // TOOLS_OBU_PARSER_H_
+#endif // AOM_TOOLS_OBU_PARSER_H_
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.h b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
index 76a9bc732..2e3c9551e 100644
--- a/third_party/aom/tools/txfm_analyzer/txfm_graph.h
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef TOOLS_TXFM_ANALYZER_H_
-#define TOOLS_TXFM_ANALYZER_H_
+#ifndef AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
+#define AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
struct Node {
Node *inNode[2];
@@ -158,4 +158,4 @@ void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
int stage_idx, int amplify_bit,
int estimate_bit);
-#endif // TOOLS_TXFM_ANALYZER_H_
+#endif // AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_