From d2499ead93dc4298c0882fe98902acb1b5209f99 Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 23:05:00 -0500 Subject: Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591 --- third_party/aom/av1/av1.cmake | 14 +- third_party/aom/av1/av1_cx_iface.c | 160 +- third_party/aom/av1/av1_dx_iface.c | 62 +- third_party/aom/av1/av1_iface_common.h | 7 +- third_party/aom/av1/common/alloccommon.h | 6 +- third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447 ++++++++- third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 8 +- .../aom/av1/common/arm/blend_a64_hmask_neon.c | 4 +- .../aom/av1/common/arm/blend_a64_vmask_neon.c | 4 +- third_party/aom/av1/common/arm/cfl_neon.c | 4 +- third_party/aom/av1/common/arm/convolve_neon.c | 363 +- third_party/aom/av1/common/arm/convolve_neon.h | 6 +- third_party/aom/av1/common/arm/jnt_convolve_neon.c | 512 +- third_party/aom/av1/common/arm/mem_neon.h | 15 +- third_party/aom/av1/common/arm/selfguided_neon.c | 18 +- third_party/aom/av1/common/arm/transpose_neon.h | 83 +- third_party/aom/av1/common/arm/warp_plane_neon.c | 714 +++ .../aom/av1/common/arm/wiener_convolve_neon.c | 145 +- third_party/aom/av1/common/av1_inv_txfm1d.c | 140 +- third_party/aom/av1/common/av1_inv_txfm1d.h | 6 +- third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 6 +- third_party/aom/av1/common/av1_loopfilter.c | 945 +++- third_party/aom/av1/common/av1_loopfilter.h | 120 +- third_party/aom/av1/common/av1_rtcd_defs.pl | 46 +- third_party/aom/av1/common/av1_txfm.c | 50 + third_party/aom/av1/common/av1_txfm.h | 32 +- third_party/aom/av1/common/blockd.c | 64 +- third_party/aom/av1/common/blockd.h | 53 +- third_party/aom/av1/common/cdef.h | 6 +- third_party/aom/av1/common/cdef_block.h | 6 +- third_party/aom/av1/common/cdef_block_simd.h | 5 + third_party/aom/av1/common/cfl.h | 6 +- third_party/aom/av1/common/common.h | 6 +- third_party/aom/av1/common/common_data.h | 75 +- third_party/aom/av1/common/convolve.c | 116 +- 
third_party/aom/av1/common/convolve.h | 21 +- third_party/aom/av1/common/entropy.h | 6 +- third_party/aom/av1/common/entropymode.h | 8 +- third_party/aom/av1/common/entropymv.c | 55 - third_party/aom/av1/common/entropymv.h | 16 +- third_party/aom/av1/common/enums.h | 12 +- third_party/aom/av1/common/filter.h | 22 +- third_party/aom/av1/common/frame_buffers.c | 11 + third_party/aom/av1/common/frame_buffers.h | 12 +- third_party/aom/av1/common/idct.c | 274 +- third_party/aom/av1/common/idct.h | 31 +- third_party/aom/av1/common/mv.h | 8 +- third_party/aom/av1/common/mvref_common.c | 381 +- third_party/aom/av1/common/mvref_common.h | 43 +- third_party/aom/av1/common/obmc.h | 14 +- third_party/aom/av1/common/obu_util.c | 147 + third_party/aom/av1/common/obu_util.h | 47 + third_party/aom/av1/common/odintrin.h | 12 +- third_party/aom/av1/common/onyxc_int.h | 29 +- third_party/aom/av1/common/ppc/cfl_ppc.c | 85 +- third_party/aom/av1/common/pred_common.c | 4 +- third_party/aom/av1/common/pred_common.h | 6 +- third_party/aom/av1/common/quant_common.h | 6 +- third_party/aom/av1/common/reconinter.c | 652 +-- third_party/aom/av1/common/reconinter.h | 148 +- third_party/aom/av1/common/reconintra.h | 6 +- third_party/aom/av1/common/resize.c | 39 +- third_party/aom/av1/common/resize.h | 6 +- third_party/aom/av1/common/restoration.c | 131 +- third_party/aom/av1/common/restoration.h | 7 +- third_party/aom/av1/common/scale.h | 7 +- third_party/aom/av1/common/scan.h | 6 +- third_party/aom/av1/common/seg_common.h | 6 +- third_party/aom/av1/common/thread_common.c | 18 +- third_party/aom/av1/common/thread_common.h | 6 +- third_party/aom/av1/common/tile_common.c | 16 + third_party/aom/av1/common/tile_common.h | 9 +- third_party/aom/av1/common/timing.h | 6 +- third_party/aom/av1/common/token_cdfs.h | 5 + third_party/aom/av1/common/txb_common.h | 243 +- third_party/aom/av1/common/warped_motion.c | 4 +- third_party/aom/av1/common/warped_motion.h | 6 +- 
.../aom/av1/common/x86/av1_convolve_scale_sse4.c | 1 - third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 6 + third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 6 +- .../aom/av1/common/x86/av1_inv_txfm_ssse3.c | 6 + .../aom/av1/common/x86/av1_inv_txfm_ssse3.h | 10 +- third_party/aom/av1/common/x86/av1_txfm_sse2.h | 6 +- third_party/aom/av1/common/x86/av1_txfm_sse4.h | 11 +- third_party/aom/av1/common/x86/cfl_simd.h | 5 + third_party/aom/av1/common/x86/convolve_2d_avx2.c | 2 - third_party/aom/av1/common/x86/convolve_2d_sse2.c | 3 +- third_party/aom/av1/common/x86/convolve_sse2.c | 11 +- .../aom/av1/common/x86/highbd_convolve_2d_avx2.c | 1 - .../aom/av1/common/x86/highbd_convolve_2d_sse4.c | 1 - .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 1 - .../aom/av1/common/x86/highbd_inv_txfm_avx2.c | 1117 +++- .../aom/av1/common/x86/highbd_inv_txfm_sse4.c | 5335 +++++++++++++++----- .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 1 - .../aom/av1/common/x86/highbd_txfm_utility_sse4.h | 28 +- .../aom/av1/common/x86/highbd_warp_plane_sse4.c | 268 +- third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 211 +- third_party/aom/av1/common/x86/reconinter_avx2.c | 496 ++ third_party/aom/av1/common/x86/selfguided_avx2.c | 23 +- third_party/aom/av1/common/x86/selfguided_sse4.c | 24 +- third_party/aom/av1/common/x86/warp_plane_sse4.c | 809 ++- .../aom/av1/common/x86/wiener_convolve_avx2.c | 3 +- .../aom/av1/common/x86/wiener_convolve_sse2.c | 3 +- third_party/aom/av1/decoder/accounting.h | 6 +- third_party/aom/av1/decoder/decodeframe.c | 555 +- third_party/aom/av1/decoder/decodeframe.h | 8 +- third_party/aom/av1/decoder/decodemv.c | 179 +- third_party/aom/av1/decoder/decodemv.h | 6 +- third_party/aom/av1/decoder/decoder.c | 21 +- third_party/aom/av1/decoder/decoder.h | 12 +- third_party/aom/av1/decoder/decodetxb.h | 6 +- third_party/aom/av1/decoder/detokenize.h | 6 +- third_party/aom/av1/decoder/dthread.h | 6 +- third_party/aom/av1/decoder/inspection.h | 6 +- 
third_party/aom/av1/decoder/obu.c | 176 +- third_party/aom/av1/decoder/obu.h | 29 +- third_party/aom/av1/encoder/aq_complexity.c | 7 +- third_party/aom/av1/encoder/aq_complexity.h | 6 +- third_party/aom/av1/encoder/aq_cyclicrefresh.c | 8 +- third_party/aom/av1/encoder/aq_cyclicrefresh.h | 6 +- third_party/aom/av1/encoder/aq_variance.c | 179 +- third_party/aom/av1/encoder/aq_variance.h | 10 +- third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 147 +- third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 6 +- third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 6 +- third_party/aom/av1/encoder/av1_quantize.c | 68 +- third_party/aom/av1/encoder/av1_quantize.h | 6 +- third_party/aom/av1/encoder/bitstream.c | 312 +- third_party/aom/av1/encoder/bitstream.h | 16 +- third_party/aom/av1/encoder/block.h | 53 +- third_party/aom/av1/encoder/blockiness.c | 1 - third_party/aom/av1/encoder/context_tree.c | 17 +- third_party/aom/av1/encoder/context_tree.h | 8 +- third_party/aom/av1/encoder/corner_detect.h | 6 +- third_party/aom/av1/encoder/corner_match.h | 6 +- third_party/aom/av1/encoder/cost.h | 6 +- third_party/aom/av1/encoder/dwt.h | 5 + third_party/aom/av1/encoder/encodeframe.c | 1075 ++-- third_party/aom/av1/encoder/encodeframe.h | 11 +- third_party/aom/av1/encoder/encodemb.c | 71 +- third_party/aom/av1/encoder/encodemb.h | 17 +- third_party/aom/av1/encoder/encodemv.c | 26 +- third_party/aom/av1/encoder/encodemv.h | 14 +- third_party/aom/av1/encoder/encoder.c | 490 +- third_party/aom/av1/encoder/encoder.h | 105 +- third_party/aom/av1/encoder/encodetxb.c | 48 +- third_party/aom/av1/encoder/encodetxb.h | 6 +- third_party/aom/av1/encoder/ethread.c | 252 +- third_party/aom/av1/encoder/ethread.h | 6 +- third_party/aom/av1/encoder/extend.h | 6 +- third_party/aom/av1/encoder/firstpass.c | 992 +--- third_party/aom/av1/encoder/firstpass.h | 19 +- third_party/aom/av1/encoder/global_motion.c | 4 +- third_party/aom/av1/encoder/global_motion.h | 6 +- third_party/aom/av1/encoder/grain_test_vectors.h | 
6 +- third_party/aom/av1/encoder/hash.h | 8 +- third_party/aom/av1/encoder/hash_motion.c | 94 +- third_party/aom/av1/encoder/hash_motion.h | 16 +- third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 38 +- third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 6 +- third_party/aom/av1/encoder/lookahead.h | 6 +- third_party/aom/av1/encoder/mathutils.h | 7 +- third_party/aom/av1/encoder/mbgraph.c | 7 +- third_party/aom/av1/encoder/mbgraph.h | 6 +- third_party/aom/av1/encoder/mcomp.c | 231 +- third_party/aom/av1/encoder/mcomp.h | 33 +- third_party/aom/av1/encoder/ml.c | 16 + third_party/aom/av1/encoder/ml.h | 11 +- third_party/aom/av1/encoder/palette.h | 6 +- .../aom/av1/encoder/partition_model_weights.h | 1457 ++++-- third_party/aom/av1/encoder/picklpf.c | 3 +- third_party/aom/av1/encoder/picklpf.h | 6 +- third_party/aom/av1/encoder/pickrst.c | 128 +- third_party/aom/av1/encoder/pickrst.h | 23 +- third_party/aom/av1/encoder/pustats.h | 183 +- third_party/aom/av1/encoder/random.h | 6 +- third_party/aom/av1/encoder/ransac.h | 6 +- .../aom/av1/encoder/rate_distortion_model_params.h | 6 +- third_party/aom/av1/encoder/ratectrl.c | 96 +- third_party/aom/av1/encoder/ratectrl.h | 29 +- third_party/aom/av1/encoder/ratectrl_xiph.c | 0 third_party/aom/av1/encoder/ratectrl_xiph.h | 0 third_party/aom/av1/encoder/rd.c | 494 +- third_party/aom/av1/encoder/rd.h | 48 +- third_party/aom/av1/encoder/rdopt.c | 4751 ++++++++++------- third_party/aom/av1/encoder/rdopt.h | 18 +- third_party/aom/av1/encoder/reconinter_enc.c | 627 +++ third_party/aom/av1/encoder/reconinter_enc.h | 127 + third_party/aom/av1/encoder/segmentation.h | 6 +- third_party/aom/av1/encoder/speed_features.c | 145 +- third_party/aom/av1/encoder/speed_features.h | 144 +- third_party/aom/av1/encoder/temporal_filter.c | 81 +- third_party/aom/av1/encoder/temporal_filter.h | 6 +- third_party/aom/av1/encoder/tokenize.h | 6 +- .../aom/av1/encoder/tx_prune_model_weights.h | 1482 +++--- .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 259 +- 
.../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 75 +- .../aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 6 +- .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 6 +- third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 9 +- third_party/aom/av1/encoder/x86/encodetxb_avx2.c | 130 + third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 46 +- .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 344 +- third_party/aom/av1/encoder/x86/pickrst_avx2.c | 403 ++ third_party/aom/av1/encoder/x86/pickrst_sse4.c | 389 ++ third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 4 +- third_party/aom/av1/exports_com | 2 + third_party/aom/av1/exports_dec | 1 + third_party/aom/av1/exports_test | 2 + 209 files changed, 23499 insertions(+), 10802 deletions(-) create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c create mode 100644 third_party/aom/av1/common/obu_util.c create mode 100644 third_party/aom/av1/common/obu_util.h delete mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.c delete mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.h create mode 100644 third_party/aom/av1/encoder/reconinter_enc.c create mode 100644 third_party/aom/av1/encoder/reconinter_enc.h create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/pickrst_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/pickrst_sse4.c create mode 100644 third_party/aom/av1/exports_com create mode 100644 third_party/aom/av1/exports_test (limited to 'third_party/aom/av1') diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 4c4f542fe..3a7cd7ee1 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -53,6 +53,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/mv.h" "${AOM_ROOT}/av1/common/mvref_common.c" "${AOM_ROOT}/av1/common/mvref_common.h" + "${AOM_ROOT}/av1/common/obu_util.c" + "${AOM_ROOT}/av1/common/obu_util.h" "${AOM_ROOT}/av1/common/odintrin.c" "${AOM_ROOT}/av1/common/odintrin.h" 
"${AOM_ROOT}/av1/common/onyxc_int.h" @@ -78,8 +80,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/thread_common.h" "${AOM_ROOT}/av1/common/tile_common.c" "${AOM_ROOT}/av1/common/tile_common.h" - "${AOM_ROOT}/av1/common/timing.h" "${AOM_ROOT}/av1/common/timing.c" + "${AOM_ROOT}/av1/common/timing.h" "${AOM_ROOT}/av1/common/token_cdfs.h" "${AOM_ROOT}/av1/common/txb_common.c" "${AOM_ROOT}/av1/common/txb_common.h" @@ -176,6 +178,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/rd.h" "${AOM_ROOT}/av1/encoder/rdopt.c" "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/reconinter_enc.c" + "${AOM_ROOT}/av1/encoder/reconinter_enc.h" "${AOM_ROOT}/av1/encoder/segmentation.c" "${AOM_ROOT}/av1/encoder/segmentation.h" "${AOM_ROOT}/av1/encoder/speed_features.c" @@ -268,7 +272,8 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c" "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c" "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" - "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c") + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" @@ -276,7 +281,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" - "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c") + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c") @@ -301,6 +308,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" + 
"${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" "${AOM_ROOT}/av1/common/cdef_block_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 3bc4804c9..3295f618a 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -14,28 +14,29 @@ #include "config/aom_config.h" #include "config/aom_version.h" -#include "aom/aom_encoder.h" #include "aom_ports/aom_once.h" +#include "aom_ports/mem_ops.h" #include "aom_ports/system_state.h" + +#include "aom/aom_encoder.h" #include "aom/internal/aom_codec_internal.h" -#include "av1/encoder/encoder.h" -#include "aom/aomcx.h" -#include "av1/encoder/firstpass.h" + #include "av1/av1_iface_common.h" #include "av1/encoder/bitstream.h" -#include "aom_ports/mem_ops.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" #define MAG_SIZE (4) #define MAX_NUM_ENHANCEMENT_LAYERS 3 struct av1_extracfg { int cpu_used; // available cpu percentage in 1/16 - int dev_sf; unsigned int enable_auto_alt_ref; unsigned int enable_auto_bwd_ref; unsigned int noise_sensitivity; unsigned int sharpness; unsigned int static_thresh; + unsigned int row_mt; unsigned int tile_columns; // log2 number of tile columns unsigned int tile_rows; // log2 number of tile rows unsigned int arnr_max_frames; @@ -98,37 +99,40 @@ struct av1_extracfg { float noise_level; int noise_block_size; #endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; }; static struct av1_extracfg default_extra_cfg = { - 0, // cpu_used - 0, // dev_sf - 1, // enable_auto_alt_ref - 0, // enable_auto_bwd_ref - 0, // noise_sensitivity - 0, // sharpness - 0, // static_thresh - 0, // tile_columns - 0, // tile_rows - 7, // arnr_max_frames - 5, // arnr_strength - 0, // min_gf_interval; 0 -> default decision - 0, // max_gf_interval; 0 -> default decision - AOM_TUNE_PSNR, // tuning - 10, // cq_level - 0, // rc_max_intra_bitrate_pct - 0, // 
rc_max_inter_bitrate_pct - 0, // gf_cbr_boost_pct - 0, // lossless - 1, // enable_cdef - 1, // enable_restoration - 0, // disable_trellis_quant - 0, // enable_qm - DEFAULT_QM_Y, // qm_y - DEFAULT_QM_U, // qm_u - DEFAULT_QM_V, // qm_v - DEFAULT_QM_FIRST, // qm_min - DEFAULT_QM_LAST, // qm_max + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + CONFIG_SHARP_SETTINGS, // sharpness + 0, // static_thresh + 0, // row_mt + 0, // tile_columns + 0, // tile_rows + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + AOM_TUNE_PSNR, // tuning + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + !CONFIG_SHARP_SETTINGS, // enable_cdef + 1, // enable_restoration + 0, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max #if CONFIG_DIST_8X8 0, #endif @@ -150,7 +154,7 @@ static struct av1_extracfg default_extra_cfg = { 0, // render width 0, // render height AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size - 0, // Single tile decoding is off by default. + 1, // this depends on large_scale_tile. 0, // error_resilient_mode off by default. 0, // s_frame_mode off by default. 
0, // film_grain_test_vector @@ -168,6 +172,8 @@ static struct av1_extracfg default_extra_cfg = { 0, // noise_level 32, // noise_block_size #endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y }; struct aom_codec_alg_priv { @@ -251,10 +257,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1); RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1); if (extra_cfg->max_gf_interval > 0) { - RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1)); - } - if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) { - RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval, + RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1)); } @@ -284,13 +287,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2); RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); RANGE_CHECK(extra_cfg, cpu_used, 0, 8); - RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, AOM_SUPERBLOCK_SIZE_DYNAMIC); RANGE_CHECK_HI(cfg, large_scale_tile, 1); RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); + RANGE_CHECK_HI(extra_cfg, row_mt, 1); + RANGE_CHECK_HI(extra_cfg, tile_columns, 6); RANGE_CHECK_HI(extra_cfg, tile_rows, 6); @@ -372,6 +376,9 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, #endif } + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); + return AOM_CODEC_OK; } @@ -581,7 +588,6 @@ static aom_codec_err_t set_encoder_config( oxcf->sframe_mode = cfg->sframe_mode; oxcf->sframe_enabled = cfg->sframe_dist != 0; oxcf->speed = extra_cfg->cpu_used; - oxcf->dev_sf = extra_cfg->dev_sf; oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; oxcf->enable_auto_brf = 
extra_cfg->enable_auto_bwd_ref; oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; @@ -637,6 +643,8 @@ static aom_codec_err_t set_encoder_config( oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; } + oxcf->row_mt = extra_cfg->row_mt; + oxcf->tile_columns = extra_cfg->tile_columns; oxcf->tile_rows = extra_cfg->tile_rows; @@ -692,6 +700,24 @@ static aom_codec_err_t set_encoder_config( oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + +#if CONFIG_REDUCED_ENCODER_BORDER + if (oxcf->superres_mode != SUPERRES_NONE || + oxcf->resize_mode != RESIZE_NONE) { + warn( + "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. " + "Disabling superres/resize.\n"); + // return AOM_CODEC_INVALID_PARAM; + disable_superres(oxcf); + oxcf->resize_mode = RESIZE_NONE; + oxcf->resize_scale_denominator = SCALE_NUMERATOR; + oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR; + } +#endif // CONFIG_REDUCED_ENCODER_BORDER + + oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; + oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; + return AOM_CODEC_OK; } @@ -731,6 +757,10 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, return res; } +static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { + return av1_get_global_headers(ctx->cpi); +} + static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); @@ -765,12 +795,6 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) { - struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args); - return update_extra_cfg(ctx, &extra_cfg); -} - static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg 
extra_cfg = ctx->extra_cfg; @@ -806,6 +830,13 @@ static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1669,6 +1700,20 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, { AOME_USE_REFERENCE, ctrl_use_reference }, @@ -1681,11 +1726,11 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AOME_SET_SCALEMODE, ctrl_set_scale_mode }, { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id }, { AOME_SET_CPUUSED, ctrl_set_cpuused }, - { AOME_SET_DEVSF, ctrl_set_devsf }, { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref }, { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref }, { AOME_SET_SHARPNESS, ctrl_set_sharpness }, { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, + { AV1E_SET_ROW_MT, ctrl_set_row_mt }, { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows }, { 
AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, @@ -1754,7 +1799,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_GET_ACTIVEMAP, ctrl_get_active_map }, { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, - + { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x }, + { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y }, { -1, NULL }, }; @@ -1850,13 +1896,13 @@ CODEC_INTERFACE(aom_codec_av1_cx) = { }, { // NOLINT - 1, // 1 cfg map - encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t - encoder_encode, // aom_codec_encode_fn_t - encoder_get_cxdata, // aom_codec_get_cx_data_fn_t - encoder_set_config, // aom_codec_enc_config_set_fn_t - NULL, // aom_codec_get_global_headers_fn_t - encoder_get_preview, // aom_codec_get_preview_frame_fn_t - NULL // aom_codec_enc_mr_get_mem_loc_fn_t + 1, // 1 cfg map + encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t + encoder_encode, // aom_codec_encode_fn_t + encoder_get_cxdata, // aom_codec_get_cx_data_fn_t + encoder_set_config, // aom_codec_enc_config_set_fn_t + encoder_get_global_headers, // aom_codec_get_global_headers_fn_t + encoder_get_preview, // aom_codec_get_preview_frame_fn_t + NULL // aom_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c index f42572019..4a6631047 100644 --- a/third_party/aom/av1/av1_dx_iface.c +++ b/third_party/aom/av1/av1_dx_iface.c @@ -26,6 +26,7 @@ #include "av1/common/alloccommon.h" #include "av1/common/frame_buffers.h" #include "av1/common/enums.h" +#include "av1/common/obu_util.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" @@ -46,6 +47,7 @@ struct aom_codec_alg_priv { int last_show_frame; // Index of last output frame. 
int byte_alignment; int skip_loop_filter; + int skip_film_grain; int decode_tile_row; int decode_tile_col; unsigned int tile_mode; @@ -103,6 +105,15 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx, priv->cfg.cfg.ext_partition = 1; } av1_zero(priv->image_with_grain); + // Turn row_mt on by default. + priv->row_mt = 1; + + // Turn on normal tile coding mode by default. + // 0 is for normal tile coding mode, and 1 is for large scale tile coding + // mode(refer to lightfield example). + priv->tile_mode = 0; + priv->decode_tile_row = -1; + priv->decode_tile_col = -1; } return AOM_CODEC_OK; @@ -216,7 +227,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, while (1) { data += bytes_read; data_sz -= bytes_read; - const uint8_t *payload_start = data; + if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME; // Check that the selected OBU is a sequence header if (obu_header.type == OBU_SEQUENCE_HEADER) { // Sanity check on sequence header size @@ -264,9 +275,9 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, } } // skip past any unread OBU header data - data = payload_start + payload_size; + data += payload_size; data_sz -= payload_size; - if (data_sz <= 0) break; // exit if we're out of OBUs + if (data_sz == 0) break; // exit if we're out of OBUs status = aom_read_obu_header_and_size( data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) return status; @@ -313,6 +324,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { cm->new_fb_idx = INVALID_IDX; cm->byte_alignment = ctx->byte_alignment; cm->skip_loop_filter = ctx->skip_loop_filter; + cm->skip_film_grain = ctx->skip_film_grain; if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { pool->get_fb_cb = ctx->get_ext_fb_cb; @@ -434,7 +446,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; 
frame_worker_data->pbi->row_mt = ctx->row_mt; - worker->hook = (AVxWorkerHook)frame_worker_hook; + worker->hook = frame_worker_hook; if (!winterface->reset(worker)) { set_error_detail(ctx, "Frame Worker thread creation failed"); return AOM_CODEC_MEM_ERROR; @@ -515,12 +527,11 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv) { - const uint8_t *data_start = data; - const uint8_t *data_end = data + data_sz; aom_codec_err_t res = AOM_CODEC_OK; - // Release any pending output frames from the previous decoder call. - // We need to do this even if the decoder is being flushed + // Release any pending output frames from the previous decoder_decode call. + // We need to do this even if the decoder is being flushed or the input + // arguments are invalid. if (ctx->frame_workers) { BufferPool *const pool = ctx->buffer_pool; RefCntBuffer *const frame_bufs = pool->frame_bufs; @@ -538,10 +549,13 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, unlock_buffer_pool(ctx->buffer_pool); } + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ if (data == NULL && data_sz == 0) { ctx->flushed = 1; return AOM_CODEC_OK; } + if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; // Reset flushed when receiving a valid frame. 
ctx->flushed = 0; @@ -552,6 +566,9 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, if (res != AOM_CODEC_OK) return res; } + const uint8_t *data_start = data; + const uint8_t *data_end = data + data_sz; + if (ctx->is_annexb) { // read the size of this temporal unit size_t length_of_size; @@ -617,6 +634,7 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img, img->fmt != grain_img_buf->fmt) { aom_img_free(grain_img_buf); grain_img_buf = NULL; + *grain_img_ptr = NULL; } } if (!grain_img_buf) { @@ -624,7 +642,14 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img, *grain_img_ptr = grain_img_buf; } - av1_add_film_grain(grain_params, img, grain_img_buf); + if (grain_img_buf) { + grain_img_buf->user_priv = img->user_priv; + if (av1_add_film_grain(grain_params, img, grain_img_buf)) { + aom_img_free(grain_img_buf); + grain_img_buf = NULL; + *grain_img_ptr = NULL; + } + } return grain_img_buf; } @@ -720,8 +745,13 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, img = &ctx->img; img->temporal_id = cm->temporal_layer_id; img->spatial_id = cm->spatial_layer_id; + if (cm->skip_film_grain) grain_params->apply_grain = 0; aom_image_t *res = add_grain_if_needed( img, &ctx->image_with_grain[*index], grain_params); + if (!res) { + aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME, + "Grain systhesis failed\n"); + } *index += 1; // Advance the iterator to point to the next image return res; } @@ -1128,6 +1158,19 @@ static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx, return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_film_grain = va_arg(args, int); + + if (ctx->frame_workers) { + AVxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.skip_film_grain = ctx->skip_film_grain; + } + + return AOM_CODEC_OK; +} + static 
aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_ACCOUNTING @@ -1231,6 +1274,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, { AV1D_SET_ROW_MT, ctrl_set_row_mt }, { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, + { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, // Getters { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h index c03892b73..4a7af580b 100644 --- a/third_party/aom/av1/av1_iface_common.h +++ b/third_party/aom/av1/av1_iface_common.h @@ -8,10 +8,11 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_AV1_IFACE_COMMON_H_ -#define AV1_AV1_IFACE_COMMON_H_ +#ifndef AOM_AV1_AV1_IFACE_COMMON_H_ +#define AOM_AV1_AV1_IFACE_COMMON_H_ #include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { @@ -132,4 +133,4 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img, return AOM_CODEC_OK; } -#endif // AV1_AV1_IFACE_COMMON_H_ +#endif // AOM_AV1_AV1_IFACE_COMMON_H_ diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h index dbcb5b947..8e5896981 100644 --- a/third_party/aom/av1/common/alloccommon.h +++ b/third_party/aom/av1/common/alloccommon.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ALLOCCOMMON_H_ -#define AV1_COMMON_ALLOCCOMMON_H_ +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ #define INVALID_IDX -1 // Invalid buffer index. 
@@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height); } // extern "C" #endif -#endif // AV1_COMMON_ALLOCCOMMON_H_ +#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c index 51c991498..bad411743 100644 --- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include + #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" @@ -19,19 +21,7 @@ #include "av1/common/enums.h" #include "av1/common/idct.h" #include "av1/common/arm/av1_inv_txfm_neon.h" - -static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) { - const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; - TxSetType tx_set_type; - if (tx_size_sqr_up > TX_32X32) { - tx_set_type = EXT_TX_SET_DCTONLY; - } else if (tx_size_sqr_up == TX_32X32) { - tx_set_type = EXT_TX_SET_DCT_IDTX; - } else { - tx_set_type = EXT_TX_SET_ALL16; - } - return tx_set_type; -} +#include "av1/common/arm/transpose_neon.h" // 1D itx types typedef enum ATTRIBUTE_PACKED { @@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { { av1_idct64_new, NULL, NULL }, }; +static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + int16x8_t temp_output; + for (int i = 0; i < height; ++i, j += step) { + temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output))); + temp_output = vaddq_s16(temp_output, in[j]); + vst1_u8(output, vqmovun_s16(temp_output)); + output += stride; + } +} + +static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, + int16x8_t res0, + int16x8_t res1) { + int16x8_t temp_output[2]; + uint8x16_t temp_output_8q; + temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))); + temp_output[0] = vaddq_s16(temp_output[0], res0); + temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))); + temp_output[1] = vaddq_s16(temp_output[1], res1); + temp_output_8q = + vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1])); + return temp_output_8q; +} + +static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, int height) { + uint8x16_t temp_output_8q; + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + 
v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = 
vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1, + int16_t *const c2, + int16_t *const c3) { + int16x4_t val = vdup_n_s16((int16_t)0); + val = vld1_lane_s16(c0, val, 0); + val = vld1_lane_s16(c1, val, 1); + val = vld1_lane_s16(c2, val, 2); + val = vld1_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + 
+ const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 20), (int16_t *)(cospi + 44)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + + // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = + 
create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vnegq_s16(x[1]); +} + +static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], 
step2[5], c2, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < 8; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += out_size; + } +} + +static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + output[0] = vmulq_n_s16(input[0], (int16_t)2); + 
output[1] = vmulq_n_s16(input[1], (int16_t)2); + output[2] = vmulq_n_s16(input[2], (int16_t)2); + output[3] = vmulq_n_s16(input[3], (int16_t)2); + output[4] = vmulq_n_s16(input[4], (int16_t)2); + output[5] = vmulq_n_s16(input[5], (int16_t)2); + output[6] = vmulq_n_s16(input[6], (int16_t)2); + output[7] = vmulq_n_s16(input[7], (int16_t)2); +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + int32x4_t out_low, out_high; + int16x4_t low, high; + int16_t scale = (int16_t)(2 * NewSqrt2); + + for (int z = 0; z < 16; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), scale); + out_high = vmull_n_s16(vget_high_s16(input[z]), scale); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + for (int z = 0; z < 32; ++z) { + output[z] = vmulq_n_s16(input[z], (int16_t)4); + } +} + +static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 
+ vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = 
vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3, + &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], 
step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + 
step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0, + &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], 
step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), + (int16_t *)(cospi + 10), (int16_t *)(cospi + 54)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), + (int16_t *)(cospi + 26), (int16_t *)(cospi + 38)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30), + (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); + const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14), + (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, 
s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + 
x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[10]; + 
int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // 
Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + 
t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), + (int16_t *)(cospi + 34), (int16_t *)(cospi + 30)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), + (int16_t *)(cospi + 50), (int16_t *)(cospi + 14)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54), + (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); + 
const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38), + (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); + const int16x4_t c5 = + create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c6 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c7 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; 
+ step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = 
step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7, + &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7, + &step2[20], &step2[27]); + btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7, + &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], step1[2]); + step2[2] = vqsubq_s16(step1[1], 
step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = 
vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = 
vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} + +static INLINE void idct32_low8_new_neon(int16x8_t 
*in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + step1[4] = step2[4]; + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] = step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + 
step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, + &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, + &step2[20], &step2[27]); + 
btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, + &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = 
vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = 
vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], 
cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], 
step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, + &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = 
vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, + &step2[20], &step2[27]); + btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, + &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + 
step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = 
vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + // Functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner static const transform_1d_neon @@ -90,10 +2112,37 @@ static const transform_1d_neon { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; -static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_new_neon, 
idct8_new_neon, NULL, NULL }, + { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL }, + { identity8_new_neon, identity8_new_neon, NULL, NULL } }, + { + { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL }, + { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon, + NULL }, + { identity16_new_neon, identity16_new_neon, identity16_new_neon, + NULL }, + }, + { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon, + idct32_new_neon }, + { NULL, NULL, NULL, NULL }, + { identity32_new_neon, identity32_new_neon, identity32_new_neon, + identity32_new_neon } }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); int32_t *temp_in = txfm_buf; @@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = 
get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE 
tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); @@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input 
+= (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); @@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( } } +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = 
inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) 
{ + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, @@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, } } -static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); @@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } } +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + + const transform_neon row_txfm = + 
lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + 
switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + static INLINE void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon( break; } } + void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, break; case TX_16X64: { - lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X16: { @@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; case TX_32X64: { - lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X32: { @@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, 
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X64: { @@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; default: diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h index 6af2d61e7..9ec658291 100644 --- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ -#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" @@ -23,6 +23,8 @@ typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, const int8_t cos_bit, const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit); DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, @@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, *eoby = eob_fill[temp_eoby]; } -#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c index 0d8233744..7134f183e 100644 --- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c +++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c @@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, uint8x8_t tmp0, tmp1; uint8x16_t res_q; uint16x8_t res, res_low, res_high; - uint32x2_t tmp0_32, tmp1_32; - uint16x4_t tmp0_16, tmp1_16; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64); if (w >= 16) { diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c index 33b06b767..194e94c8c 100644 --- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c +++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c @@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, uint8x8_t tmp0, tmp1; uint8x16_t tmp0_q, tmp1_q, res_q; uint16x8_t res, res_low, res_high; - 
uint32x2_t tmp0_32, tmp1_32; - uint16x4_t tmp0_16, tmp1_16; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c index d731b6a66..39025b5e5 100644 --- a/third_party/aom/av1/common/arm/cfl_neon.c +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } -#if __ARM_ARCH <= 7 +#ifndef __aarch64__ uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), vpadd_u16(vget_low_u16(b), vget_high_u16(b))); @@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, // Permute and add in such a way that each lane contains the block sum. 
// [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] -#if __ARM_ARCH >= 8 +#ifdef __aarch64__ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); #else diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c index f15744c94..d0c4f8ff6 100644 --- a/third_party/aom/av1/common/arm/convolve_neon.c +++ b/third_party/aom/av1/common/arm/convolve_neon.c @@ -13,6 +13,8 @@ #include #include +#include "config/av1_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" @@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8( return vqmovun_s16(sum); } +#if !defined(__aarch64__) +static INLINE uint8x8_t convolve8_horiz_4x1( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t shift_round_0, const int16x4_t shift_by_bits) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. 
So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + sum = vqrshl_s16(sum, shift_round_0); + sum = vqrshl_s16(sum, shift_by_bits); + + return vqmovun_s16(vcombine_s16(sum, sum)); +} +#endif // !defined(__arch64__) + static INLINE uint8x8_t convolve8_vert_8x4( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, @@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, (void)conv_params; (void)filter_params_y; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; +#endif assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || @@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t shift_by_bits = vdupq_n_s16(-bits); src -= horiz_offset; - +#if defined(__aarch64__) if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, w -= 4; } while (w > 0); } else { +#endif int width; const uint8_t *s; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + +#if defined(__aarch64__) + int16x8_t s8, s9, s10; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; +#endif if (w <= 4) { +#if defined(__aarch64__) do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } h -= 8; } while (h > 0); +#else + int16x8_t tt0; + int16x4_t x0, x1, x2, x3, x4, x5, x6, x7; + const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0); + const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits); + do { + t0 = 
vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x0 = vget_low_s16(tt0); // a0 a1 a2 a3 + x4 = vget_high_s16(tt0); // a4 a5 a6 a7 + + t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x7 = vget_low_s16(tt0); // a8 a9 a10 a11 + + x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4 + x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5 + x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6 + x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8 + x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9 + x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10 + + src += src_stride; + + t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter, + shift_round_0_low, shift_by_bits_low); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + } + h -= 1; + } while (h > 0); +#endif } else { uint8_t *d; - int16x8_t s11, s12, s13, s14; - + int16x8_t s11; +#if defined(__aarch64__) + int16x8_t s12, s13, s14; do { __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, dst += 8 * dst_stride; h -= 8; } while (h > 0); +#else + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + width = w; + s = src + 8; + d = dst; + __builtin_prefetch(dst); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s11 = s0; + s0 = s7; + + s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 
a13 + s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + vst1_u8(d, t0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + h -= 1; + } while (h > 0); +#endif } +#if defined(__aarch64__) } +#endif } void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, @@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, filter_params_y, subpel_y_q4 & SUBPEL_MASK); if (w <= 4) { - uint8x8_t d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t d01; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + uint8x8_t d23; + int16x4_t s8, s9, s10, d1, d2, d3; +#endif s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); @@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, do { s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; +#if defined(__aarch64__) s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); @@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; h -= 4; +#else + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); + dst += dst_stride; + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = 
s7; + h -= 1; +#endif } while (h > 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - + uint8x8_t t0; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; + int16x8_t s8, s9, s10; +#endif do { __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, do { s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; +#if defined(__aarch64__) s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); @@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + __builtin_prefetch(d); + __builtin_prefetch(s); + + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); src += 8; dst += 8; @@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ConvolveParams *conv_params) { int im_dst_stride; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); @@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, assert(conv_params->round_0 > 0); if (w <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; +#endif const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 
= vdup_n_s16(-(conv_params->round_0 - 1)); do { s = src_ptr; + +#if defined(__aarch64__) __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); @@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 4 * src_stride; dst_ptr += 4 * im_dst_stride; height -= 4; +#else + int16x8_t tt0; + + __builtin_prefetch(s); + + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + if (w == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += im_dst_stride; + } else if (w == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += im_dst_stride; + } + + src_ptr += src_stride; + height -= 1; +#endif } while (height > 0); } else { int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7; int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; +#endif const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); +#if defined(__aarch64__) do { __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); @@ -886,6 +1100,45 @@ void 
av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, dst_ptr += 8 * im_dst_stride; height -= 8; } while (height > 0); +#else + do { + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += im_dst_stride; + height -= 1; + } while (height > 0); +#endif } // vertical @@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, width = w; if (width <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x4_t d0, d1, d2, d3; - uint16x8_t dd0, dd1; - uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t d0; + uint16x8_t dd0; + uint8x8_t d01; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t d1, d2, d3; + uint16x8_t dd1; + uint8x8_t d23; +#endif d_u8 = dst_u8_ptr; v_s = v_src_ptr; @@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, v_s += (7 * im_stride); do { +#if defined(__aarch64__) load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10); v_s += (im_stride << 2); @@ -1008,11 +1269,48 @@ void 
av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits); + d01 = vqmovn_u16(dd0); + + if (w == 4) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + + } else if (w == 2) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); } else { // if width is a multiple of 8 & height is a multiple of 4 - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x8_t res0, res1, res2, res3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t res1, res2, res3; +#endif do { __builtin_prefetch(v_src_ptr + 0 * im_stride); @@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, height = h; do { +#if defined(__aarch64__) load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10); v_s += (im_stride << 2); @@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1q_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); v_src_ptr += 8; dst_u8_ptr += 8; diff --git 
a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h index 47c93d645..f382984f2 100644 --- a/third_party/aom/av1/common/arm/convolve_neon.h +++ b/third_party/aom/av1/common/arm/convolve_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_ -#define AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ #include @@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32( return res; } -#endif // AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c index 4015082b4..e5674ef7c 100644 --- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c +++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c @@ -22,12 +22,108 @@ #include "av1/common/arm/mem_neon.h" #include "av1/common/arm/transpose_neon.h" +#if !defined(__aarch64__) +static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x4_t sub_const_vec, + const int16_t round_bits, + const int use_jnt_comp_avg, uint8x8_t *t0) { + int16x4_t tmp0; + uint16x4_t tmp_u0; + uint32x4_t sum0; + int32x4_t dst0; + int16x8_t tmp4; + + if (use_jnt_comp_avg) { + const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); + + sum0 = vmull_n_u16(res0, fwd_offset); + sum0 = vmlal_n_u16(sum0, d0, bck_offset); + + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec)); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp4 = vcombine_s16(tmp0, tmp0); + + *t0 = vqmovun_s16(tmp4); + } else { + const int16x4_t round_bits_vec = vdup_n_s16(-round_bits); + tmp_u0 = vhadd_u16(res0, d0); + + tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), 
sub_const_vec); + + tmp0 = vqrshl_s16(tmp0, round_bits_vec); + + tmp4 = vcombine_s16(tmp0, tmp0); + + *t0 = vqmovun_s16(tmp4); + } +} + +static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x4_t sub_const, + const int16_t round_bits, + const int use_jnt_comp_avg, uint8x8_t *t0) { + int16x4_t tmp0, tmp2; + int16x8_t f0; + uint32x4_t sum0, sum2; + int32x4_t dst0, dst2; + + uint16x8_t tmp_u0; + + if (use_jnt_comp_avg) { + const int32x4_t sub_const_vec = vmovl_s16(sub_const); + const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); + + sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset); + sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset); + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + + sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset); + sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp2 = vqmovn_s32(dst2); + + f0 = vcombine_s16(tmp0, tmp2); + + *t0 = vqmovun_s16(f0); + + } else { + const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const); + const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits); + + tmp_u0 = vhaddq_u16(res0, d0); + + f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec); + + f0 = vqrshlq_s16(f0, round_bits_vec); + + *t0 = vqmovun_s16(f0); + } +} +#endif // !defined(__aarch64__) + static INLINE void compute_avg_4x4( uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const_vec, const int16_t round_bits, - const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + 
const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { int16x4_t tmp0, tmp1, tmp2, tmp3; uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; uint32x4_t sum0, sum1, sum2, sum3; @@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4( uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const, const int16_t round_bits, - const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, + const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, uint8x8_t *t3) { int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t f0, f1, f2, f3; @@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon( int16_t *dst_ptr; int dst_stride; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; dst_ptr = im_block; dst_stride = im_stride; @@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon( width = w; if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint8x8_t t0; const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 = vdup_n_s16(-(round_0)); +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint8x8_t t1, t2, t3; +#endif do { s = src; __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); @@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon( src += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(dst_ptr); + s += 8; + t0 = vld1_u8(s); // a8 a9 
a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1_s16(dst_ptr, d0); + + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif } while (height > 0); } else { int16_t *d_tmp; - int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint8x8_t t0; const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0)); - do { +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); @@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon( src += 8 * src_stride; dst_ptr += 8 * dst_stride; height -= 8; +#else + int16x8_t temp_0; + t0 = vld1_u8(src); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src + 8; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 
a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, horiz_const, shift_round_0); + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif } while (height > 0); } } @@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon( const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x4_t res4, res5, res6, res7; - uint16x4_t d0, d1, d2, d3; - uint8x8_t t0, t1; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t res4, d0; + uint8x8_t t0; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t res5, res6, res7, d1, d2, d3; + uint8x8_t t1; +#endif dst = conv_params->dst; src_ptr = im_block; @@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon( s += (7 * im_stride); do { +#if defined(__aarch64__) load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); s += (im_stride << 2); @@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon( bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, &t0, &t1); - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 1); // 30 31 32 33 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); d_u8 += dst8_stride; } 
else { @@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon( s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1_s16(s); + s += (im_stride); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d_u8 + 0 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, + round_bits, use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + + } else { + vst1_u16(d, d0); + d += (dst_stride); + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height--; +#endif } while (height > 0); src_ptr += 4; dst_ptr += 4; @@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, uint8_t *dst_u8_ptr; CONV_BUF_TYPE *d, *dst_ptr; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; - + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif s = src_ptr; dst_ptr = dst; dst_u8_ptr = dst8; @@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, height = h; if ((w == 4) || (h == 4)) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint16x4_t res4, res5, res6, res7; - uint32x2_t tu0, tu1; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint16x4_t res4; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint16x4_t res5, res6, res7; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0); int16x8_t u0, u1; +#else + int16x4_t temp_0; +#endif const int16x4_t zero = vdup_n_s16(0); const int16x4_t round_offset_vec = vdup_n_s16(round_offset); const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); @@ -746,6 +959,7 @@ void 
av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d_u8 = dst_u8_ptr; width = w; __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); @@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, dst_ptr += (dst_stride << 2); dst_u8_ptr += (dst8_stride << 2); height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(d); + + s += 8; + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + temp_0 = s7; + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + s0 = s4; + s4 = temp_0; + if (conv_params->do_average) { + __builtin_prefetch(d); + __builtin_prefetch(d_u8); + + res4 = vld1_u16(d); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset_vec, round_bits, + use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + } + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride); + dst_ptr += (dst_stride); + dst_u8_ptr += (dst8_stride); + height--; +#endif } while (height > 0); } else { CONV_BUF_TYPE *d_tmp; uint8_t *d_u8_tmp; - int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, 
s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t res8, res9, res10, res11; - + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; const int16x8_t round_offset128 = vdupq_n_s16(round_offset); const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); @@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d = dst_ptr = dst; d_u8 = dst_u8_ptr = dst8; do { +#if defined(__aarch64__) + int16x8_t s11, s12, s13, s14; + int16x8_t s8, s9, s10; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res9, res10, res11; __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); __builtin_prefetch(src_ptr + 2 * src_stride); @@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, dst_ptr += 8 * dst_stride; dst_u8_ptr += 8 * dst8_stride; height -= 8; +#else + int16x8_t temp_0; + __builtin_prefetch(src_ptr); + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, zero, 
shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + if (conv_params->do_average) { + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += (dst_stride); + } + + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst_u8_ptr += dst8_stride; + height--; +#endif } while (height > 0); } } @@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, uint8_t *dst_u8_ptr; CONV_BUF_TYPE *d, *dst_ptr; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; s = src_ptr; dst_ptr = dst; @@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, assert((conv_params->round_1 - 2) >= bits); if ((w == 4) || (h == 4)) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint16x4_t res4, res5, res6, res7; - uint32x2_t tu0, tu1, tu2, tu3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + uint16x4_t res4; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); int16x8_t u0, u1, u2, u3; + uint8x8_t t0; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + uint16x4_t res5, res6, res7; + uint8x8_t t1; +#endif const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x4_t shift_vec = vdup_n_s16(-shift_value); const int16x4_t zero = vdup_n_s16(0); @@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s += (7 * src_stride); do { +#if defined(__aarch64__) load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); @@ -1154,17 +1492,13 @@ void 
av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1); - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 1); // 30 31 32 33 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); d_u8 += dst8_stride; } else { store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), @@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s += (src_stride << 2); height -= 4; +#else + load_unaligned_u8_4x1(s, src_stride, &tu0); + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + s7 = vget_low_s16(u0); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + + d0 = vadd_s16(d0, round_offset64); + + if (conv_params->do_average) { + __builtin_prefetch(d); + + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + d += (dst_stride); + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + s += (src_stride); + height--; +#endif } while (height > 0); src_ptr += 4; dst_ptr += 4; @@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, } while (width > 0); } else { CONV_BUF_TYPE *d_tmp; - 
int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t res8, res9, res10, res11; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; const int16x8_t round_offset128 = vdupq_n_s16(round_offset); const int16x8_t shift_vec = vdupq_n_s16(-shift_value); const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x8_t zero = vdupq_n_s16(0); - +#if defined(__aarch64__) + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res10, res11, res9; +#endif dst_ptr = dst; dst_u8_ptr = dst8; do { @@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d_u8 = dst_u8_ptr; do { +#if defined(__aarch64__) load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s6 = s14; s += (8 * src_stride); height -= 8; +#else + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp); + + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += dst_stride; + } + + s += (src_stride); + height--; +#endif } while (height > 0); src_ptr += 8; dst_ptr += 8; diff --git a/third_party/aom/av1/common/arm/mem_neon.h 
b/third_party/aom/av1/common/arm/mem_neon.h index 4bf45a52c..c4ae2e784 100644 --- a/third_party/aom/av1/common/arm/mem_neon.h +++ b/third_party/aom/av1/common/arm/mem_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_MEM_NEON_H_ -#define AV1_COMMON_ARM_MEM_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_ +#define AOM_AV1_COMMON_ARM_MEM_NEON_H_ #include #include @@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, *tu1 = vset_lane_u32(a, *tu1, 1); } +static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); +} + static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, uint32x2_t *tu0) { uint32_t a; @@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, vst1q_u32(s, s4); } -#endif // AV1_COMMON_ARM_MEM_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c index b4808a972..b3a37c4cb 100644 --- a/third_party/aom/av1/common/arm/selfguided_neon.c +++ b/third_party/aom/av1/common/arm/selfguided_neon.c @@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); } -void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride, - int16_t *src, const int src_stride, - int32_t *dst, const int dst_stride, - const int width, const int height) { +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp; @@ -1340,10 +1341,10 @@ static INLINE void 
src_convert_hbd_copy(const uint16_t *src, int src_stride, } } -void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, - int stride, int32_t *flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { const sgr_params_type *const params = &sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); @@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, if (params->r[1] > 0) restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); + return 0; } void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h index fe134087b..8a3d9f07f 100644 --- a/third_party/aom/av1/common/arm/transpose_neon.h +++ b/third_party/aom/av1/common/arm/transpose_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_ -#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ #include @@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); } +static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1)); + const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3)); + const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5)); + const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const 
int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *out = d0.val[0]; + *(out + 1) = d1.val[0]; + *(out + 2) = d2.val[0]; + *(out + 3) = d3.val[0]; + *(out + 4) = d0.val[1]; + *(out + 5) = d1.val[1]; + *(out + 6) = d2.val[1]; + *(out + 7) = d3.val[1]; +} + static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { // Swap 16 bit elements. Goes from: @@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, *a3 = c1.val[1]; } -#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c new file mode 100644 index 000000000..7f02d42a7 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +/* This is a modified version of 'warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. +*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 
0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + 
{-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 
35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + 
{ 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, 
-1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0, + uint8x8_t src_1, int16x4_t *res) { + int16x8_t coeff_0, coeff_1; + int16x8_t pix_0, pix_1; + + coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]), + vreinterpret_s16_s32(x1.val[0])); + coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]), + vreinterpret_s16_s32(x1.val[1])); + + pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0)); + pix_0 = vmulq_s16(coeff_0, pix_0); + + pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1)); + pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1); + + *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0)); +} + +static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2, + uint8x16_t src_3, uint8x16_t src_4, + int16x8_t *tmp_dst, int sx, int alpha, + int k, const int offset_bits_horiz, + const int 
reduce_bits_horiz) { + const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0 }; + const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz)); + const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz); + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int32x2x2_t b0, b1; + uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low; + int32x4_t tmp_res_low, tmp_res_high; + uint16x8_t res; + int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd; + + uint8x16_t tmp_0 = vandq_u8(src_1, mask); + uint8x16_t tmp_1 = vandq_u8(src_2, mask); + uint8x16_t tmp_2 = vandq_u8(src_3, mask); + uint8x16_t tmp_3 = vandq_u8(src_4, mask); + + tmp_2 = vextq_u8(tmp_0, tmp_0, 1); + tmp_3 = vextq_u8(tmp_1, tmp_1, 1); + + src_1 = vaddq_u8(tmp_0, tmp_2); + src_2 = vaddq_u8(tmp_1, tmp_3); + + src_1_low = vget_low_u8(src_1); + src_2_low = vget_low_u8(src_2); + src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4)); + src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4)); + src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2)); + src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6)); + + // Loading the 8 filter taps + f0 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS])); + f1 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS])); + f2 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS])); + f3 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS])); + f4 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS])); + f5 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS])); + f6 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS])); + f7 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS])); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)), + vreinterpret_s32_s16(vget_low_s16(f2))); + b1 = 
vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)), + vreinterpret_s32_s16(vget_low_s16(f6))); + convolve(b0, b1, src_1_low, src_3_low, &res_0246_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)), + vreinterpret_s32_s16(vget_low_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)), + vreinterpret_s32_s16(vget_low_s16(f7))); + convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)), + vreinterpret_s32_s16(vget_high_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)), + vreinterpret_s32_s16(vget_high_s16(f6))); + convolve(b0, b1, src_2_low, src_4_low, &res_1357_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)), + vreinterpret_s32_s16(vget_high_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)), + vreinterpret_s32_s16(vget_high_s16(f7))); + convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd); + + tmp_res_low = vaddl_s16(res_0246_even, res_1357_even); + tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high)); + res = vqrshlq_u16(res, shift); + + tmp_dst[k + 7] = vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_neon(const int16x8_t *src, + int32x4_t *res_low, int32x4_t *res_high, + int sy, int gamma) { + int16x4_t src_0, src_1, fltr_0, fltr_1; + int32x4_t res_0, res_1; + int32x2_t res_0_im, res_1_im; + int32x4_t res_even, res_odd, im_res_0, im_res_1; + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int16x8x2_t b0, b1, b2, b3; + int32x4x2_t c0, c1, c2, c3; + int32x4x2_t d0, d1, d2, d3; + + b0 = vtrnq_s16(src[0], src[1]); + b1 = vtrnq_s16(src[2], src[3]); + b2 = vtrnq_s16(src[4], src[5]); + b3 = vtrnq_s16(src[6], src[7]); + + c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b0.val[1])); + c1 = 
vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]), + vreinterpretq_s32_s16(b1.val[1])); + c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b2.val[1])); + c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), + vreinterpretq_s32_s16(b3.val[1])); + + f0 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); + d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); + d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3)); + d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7)); + + // row:0,1 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 even_col:4,6 + src_1 = 
vget_low_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 even_col:0,2,4,6 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 even_col:0,2,4,6 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 even_col:0,2,4,6 + res_even = vaddq_s32(im_res_0, im_res_1); + + // row:0,1 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 odd_col:5,7 + 
src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 odd_col:1,3,5,7 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 odd_col:1,3,5,7 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 odd_col:1,3,5,7 + res_odd = vaddq_s32(im_res_0, im_res_1); + + // reordering as 0 1 2 3 | 4 5 6 7 + c0 = vtrnq_s32(res_even, res_odd); + + // Final store + *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1])); + *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1])); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int16x8_t tmp[15]; + const int bd = 8; + const int w0 = 
conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int32x4_t fwd = vdupq_n_s32((int32_t)w0); + const int32x4_t bwd = vdupq_n_s32((int32_t)w1); + const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd)); + + int limit = 0; + uint8x16_t vec_dup, mask_val; + int32x4_t res_lo, res_hi; + int16x8_t result_final; + uint8x16_t src_1, src_2, src_3, src_4; + uint8x16_t indx_vec = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + uint8x16_t cmp_vec; + + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert); + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16x4_t res_sub_const = + vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)))); + int k; + + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << 
WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // horizontal + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + + if (out_of_boundary_left >= 0) { + limit = out_of_boundary_left + 1; + cmp_vec = vdupq_n_u8(out_of_boundary_left); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcleq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + if (out_of_boundary_right >= 0) { + limit = 15 - (out_of_boundary_right + 1); + cmp_vec = vdupq_n_u8(15 - 
out_of_boundary_right); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcgeq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } + + // vertical + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + const int16x8_t *v_src = tmp + (k + 4); + + vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma); + + res_lo = vaddq_s32(res_lo, add_const_vert); + res_hi = vaddq_s32(res_hi, add_const_vert); + + if (conv_params->is_compound) { + uint16_t *const p = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + res_lo = vrshlq_s32(res_lo, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + uint16x4_t tmp16_lo = vld1_u16(p); + int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); + int16x4_t tmp16_low; + if (conv_params->use_jnt_comp_avg) { + res_lo = vmulq_s32(res_lo, bwd); + tmp32_lo = vmulq_s32(tmp32_lo, fwd); + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, 1); + } + int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const); + res_low = vqrshl_s16(res_low, 
round_bits_vec); + int16x8_t final_res_low = vcombine_s16(res_low, res_low); + uint8x8_t res_8_low = vqmovun_s16(final_res_low); + + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0); + } else { + uint16x4_t res_u16_low = vqmovun_s32(res_lo); + vst1_u16(p, res_u16_low); + } + if (p_width > 4) { + uint16_t *const p4 = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = vrshlq_s32(res_hi, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4]; + + uint16x4_t tmp16_hi = vld1_u16(p4); + int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); + int16x4_t tmp16_high; + if (conv_params->use_jnt_comp_avg) { + res_hi = vmulq_s32(res_hi, bwd); + tmp32_hi = vmulq_s32(tmp32_hi, fwd); + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS); + } else { + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, 1); + } + int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const); + res_high = vqrshl_s16(res_high, round_bits_vec); + int16x8_t final_res_high = vcombine_s16(res_high, res_high); + uint8x8_t res_8_high = vqmovun_s16(final_res_high); + + vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high), + 0); + } else { + uint16x4_t res_u16_high = vqmovun_s32(res_hi); + vst1_u16(p4, res_u16_high); + } + } + } else { + res_lo = vrshlq_s32(res_lo, shift_vert); + res_hi = vrshlq_s32(res_hi, shift_vert); + + result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi)); + result_final = vsubq_s16(result_final, sub_constant); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + uint8x8_t val = vqmovun_s16(result_final); + + if (p_width == 4) { + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0); + } else { + vst1_u8(p, val); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c 
b/third_party/aom/av1/common/arm/wiener_convolve_neon.c index 72fbed4d4..a9bb5bcf0 100644 --- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c +++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c @@ -26,7 +26,6 @@ Apply horizontal filter and store in a temporary buffer. When applying vertical filter, overwrite the original pixel values. */ - void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, /* if height is a multiple of 8 */ if (!(h & 7)) { int16x8_t res0, res1, res2, res3; - uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11; + uint16x8_t res4; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; +#if defined(__aarch64__) + uint16x8_t res5, res6, res7, res8, res9, res10, res11; uint8x8_t t8, t9, t10, t11, t12, t13, t14; do { @@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, dst_ptr += 8 * MAX_SB_SIZE; height -= 8; } while (height > 0); +#else + uint8x8_t temp_0; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr); + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = 
vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + vst1q_u16(d_tmp, res4); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height--; + } while (height > 0); +#endif } else { /*if height is a multiple of 4*/ - int16x8_t tt0, tt1, tt2, tt3; const uint8_t *s; + int16x8_t tt0, tt1, tt2, tt3; + uint16x8_t d0; + uint8x8_t t0, t1, t2, t3; + +#if defined(__aarch64__) uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t d0, d1, d2, d3; + uint16x8_t d1, d2, d3; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int16x4_t s11, s12, s13, s14; - uint8x8_t t0, t1, t2, t3; - do { __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); @@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, dst_ptr += 4 * MAX_SB_SIZE; height -= 4; } while (height > 0); +#else + uint8x8_t temp_0, t4, t5, t6, t7; + + do { + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + + __builtin_prefetch(dst_ptr); + + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + d0 
= wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd, + conv_params->round_0); + + vst1q_u16(d_tmp, d0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height -= 1; + } while (height > 0); +#endif } { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t t0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t1, t2, t3; +#endif int16_t *src_tmp_ptr, *s; uint8_t *dst_tmp_ptr; height = h; @@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, d = dst_tmp_ptr; height = h; +#if defined(__aarch64__) do { __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); @@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 8; } while (w > 0); +#else + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#endif } } diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c index 8514dc64c..7ef2d6d7f 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.c +++ b/third_party/aom/av1/common/av1_inv_txfm1d.c @@ -11,56 +11,7 @@ #include #include "av1/common/av1_inv_txfm1d.h" - -static void range_check_buf(int32_t stage, const int32_t *input, - const int32_t *buf, int32_t size, int8_t bit) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - const int64_t max_value = (1LL << (bit - 1)) - 1; - const int64_t min_value = -(1LL << (bit - 1)); - - int in_range = 1; 
- - for (int i = 0; i < size; ++i) { - if (buf[i] < min_value || buf[i] > max_value) { - in_range = 0; - } - } - - if (!in_range) { - fprintf(stderr, "Error: coeffs contain out-of-range values\n"); - fprintf(stderr, "size: %d\n", size); - fprintf(stderr, "stage: %d\n", stage); - fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, - max_value); - - fprintf(stderr, "coeffs: "); - - fprintf(stderr, "["); - for (int j = 0; j < size; j++) { - if (j > 0) fprintf(stderr, ", "); - fprintf(stderr, "%d", input[j]); - } - fprintf(stderr, "]\n"); - - fprintf(stderr, " buf: "); - - fprintf(stderr, "["); - for (int j = 0; j < size; j++) { - if (j > 0) fprintf(stderr, ", "); - fprintf(stderr, "%d", buf[j]); - } - fprintf(stderr, "]\n\n"); - } - - assert(in_range); -#else - (void)stage; - (void)input; - (void)buf; - (void)size; - (void)bit; -#endif -} +#include "av1/common/av1_txfm.h" // TODO(angiebird): Make 1-d txfm functions static // @@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = input[2]; bf1[2] = input[1]; bf1[3] = input[3]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[5]; bf1[6] = input[3]; bf1[7] = input[7]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 
stage++; @@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = input[11]; bf1[14] = input[7]; bf1[15] = input[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); // stage 3 stage++; @@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = input[23]; bf1[30] = input[15]; bf1[31] = input[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -437,7 +388,7 @@ 
void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, 
input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[1] = round_shift(x1, bit); output[2] = round_shift(x2, bit); output[3] = round_shift(x3, bit); - range_check_buf(6, input, output, 4, stage_range[6]); } void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[4]; bf1[6] = input[1]; bf1[7] = input[6]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[1] 
- bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = input[12]; bf1[14] = input[1]; bf1[15] = input[14]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = 
half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); - range_check_buf(stage, 
input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = input[47]; bf1[62] = input[31]; bf1[63] = input[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); bf1[63] = clamp_value(bf0[62] + bf0[63], 
stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, 
size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; @@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h index 64a1a921c..c31c019aa 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_INV_TXFM1D_H_ -#define AV1_INV_TXFM1D_H_ +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ #include "av1/common/av1_txfm.h" @@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, } #endif -#endif // AV1_INV_TXFM1D_H_ +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h index 4c600f756..7d80a0099 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_INV_TXFM2D_CFG_H_ -#define AV1_INV_TXFM2D_CFG_H_ +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ #include "av1/common/av1_inv_txfm1d.h" // sum of fwd_shift_## @@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL]; extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/]; extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/]; -#endif // AV1_INV_TXFM2D_CFG_H_ +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c index 9d68b8760..537d8dfe9 100644 --- a/third_party/aom/av1/common/av1_loopfilter.c +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -68,23 +68,6 @@ static const int mode_lf_lut[] = { // 10101010|10101010 // // A loopfilter should be applied to every other 4x4 horizontally. -// TODO(chengchen): make these tables static -const FilterMask left_txform_mask[TX_SIZES] = { - { { 0xffffffffffffffffULL, // TX_4X4, - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - - { { 0x5555555555555555ULL, // TX_8X8, - 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } }, - - { { 0x1111111111111111ULL, // TX_16X16, - 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } }, - - { { 0x0101010101010101ULL, // TX_32X32, - 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } }, - - { { 0x0001000100010001ULL, // TX_64X64, - 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, -}; // 256 bit masks (64x64 / 4x4) for above transform size for Y plane. // We use 4 uint64_t to represent the 256 bit. @@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = { // 00000000|00000000 // // A loopfilter should be applied to every other 4x4 horizontally. 
-const FilterMask above_txform_mask[TX_SIZES] = { - { { 0xffffffffffffffffULL, // TX_4X4 - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - { { 0x0000ffff0000ffffULL, // TX_8X8 - 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } }, - - { { 0x000000000000ffffULL, // TX_16X16 - 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } }, - - { { 0x000000000000ffffULL, // TX_32X32 - 0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } }, - - { { 0x000000000000ffffULL, // TX_64X64 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 }; -// 64 bit mask to shift and set for each prediction size. A bit is set for -// each 4x4 block that would be in the top left most block of the given block -// size in the 64x64 block. -const FilterMask size_mask_y[BLOCK_SIZES_ALL] = { - { { 0x0000000000000001ULL, // BLOCK_4X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000010001ULL, // BLOCK_4X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000000003ULL, // BLOCK_8X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000030003ULL, // BLOCK_8X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0003000300030003ULL, // BLOCK_8X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00000000000f000fULL, // BLOCK_16X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000f000f000f000fULL, // BLOCK_16X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000f000f000f000fULL, // BLOCK_16X32 - 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // 
BLOCK_32X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // BLOCK_32X32 - 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // BLOCK_32X64 - 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } }, - - { { 0xffffffffffffffffULL, // BLOCK_64X32 - 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0xffffffffffffffffULL, // BLOCK_64X64 - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - // Y plane max coding block size is 128x128, but the codec divides it - // into 4 64x64 blocks. - // BLOCK_64X128 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - // BLOCK_128X64 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - // BLOCK_128X128 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - - { { 0x0001000100010001ULL, // BLOCK_4X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000000000000000fULL, // BLOCK_16X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0003000300030003ULL, // BLOCK_8X32 - 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; - { { 0x0000000000ff00ffULL, // BLOCK_32X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; - { { 0x000f000f000f000fULL, // BLOCK_16X64 - 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } }, +const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1, + -1, -1, -1, 0, 1, 2, + 3, -1, -1, -1, -1, -1, + -1, -1, -1, -1 }; + +const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 
0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, 
TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; - { { 0xffffffffffffffffULL, // BLOCK_64X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } } +const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 }; LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row, int mi_col) { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return NULL; assert(cm->lf.lfm != NULL); const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 const int col = mi_col >> MIN_MIB_SIZE_LOG2; @@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { SIMD_WIDTH); } } -static uint8_t get_filter_level(const AV1_COMMON *cm, - const loop_filter_info_n *lfi_n, - const int dir_idx, int plane, - const MB_MODE_INFO *mbmi) { + +uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n, + const int dir_idx, int plane, + const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; if (cm->delta_lf_present_flag) { int delta_lf; @@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, } } } - -#if LOOP_FILTER_BITMASK - memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64, - 
sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.y_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.y_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.u_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.u_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.v_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.v_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64); -#endif // LOOP_FILTER_BITMASK } #if LOOP_FILTER_BITMASK @@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, // After locating which uint64_t, mi_row % 4 is the // row offset, and each row has 16 = 1 << stride_log2 4x4 units. // Therefore, shift = (row << stride_log2) + mi_col; -static int get_index_shift(int mi_col, int mi_row, int *index) { +int get_index_shift(int mi_col, int mi_row, int *index) { // *index = mi_row >> 2; // rows = mi_row % 4; // stride_log2 = 4; @@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, else lfm->lfl_y_hor[row][col] = level; } else if (plane == 1) { - if (dir == VERT_EDGE) - lfm->lfl_u_ver[row][col] = level; - else - lfm->lfl_u_hor[row][col] = level; + lfm->lfl_u[row][col] = level; } else { - if (dir == VERT_EDGE) - lfm->lfl_v_ver[row][col] = level; - else - lfm->lfl_v_hor[row][col] = level; + lfm->lfl_v[row][col] = level; } } } @@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, const TX_SIZE prev_tx_size = plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy) : mbmi_prev->tx_size; - const TX_SIZE min_tx_size = - (dir == VERT_EDGE) ? 
AOMMIN(txsize_horz_map[tx_size], - txsize_horz_map[prev_tx_size]) - : AOMMIN(txsize_vert_map[tx_size], - txsize_vert_map[prev_tx_size]); + TX_SIZE min_tx_size = (dir == VERT_EDGE) + ? AOMMIN(txsize_horz_map[tx_size], + txsize_horz_map[prev_tx_size]) + : AOMMIN(txsize_vert_map[tx_size], + txsize_vert_map[prev_tx_size]); + min_tx_size = AOMMIN(min_tx_size, TX_16X16); assert(min_tx_size < TX_SIZES); const int row = r % MI_SIZE_64X64; const int col = c % MI_SIZE_64X64; @@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, } else if (plane == 1) { av1_zero(lfm->left_u); av1_zero(lfm->above_u); - av1_zero(lfm->lfl_u_ver); - av1_zero(lfm->lfl_u_hor); + av1_zero(lfm->lfl_u); } else { av1_zero(lfm->left_v); av1_zero(lfm->above_v); - av1_zero(lfm->lfl_v_ver); - av1_zero(lfm->lfl_v_hor); + av1_zero(lfm->lfl_v); } } } @@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2( if ((mask_16x16_0 & mask_16x16_1) & 1) { if (plane) { - // TODO(any): add aom_lpf_vertical_6_dual for chroma plane. - aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { - // TODO(any): add dual function simd function. Current sse2 code - // just called aom_lpf_vertical_14_sse2 twice. 
aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); @@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2( if ((mask_8x8_0 & mask_8x8_1) & 1) { if (plane) { - aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, lfi1->lim, @@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2( if ((mask_16x16_0 & mask_16x16_1) & 1) { if (plane) { - aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, @@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2( if ((mask_8x8_0 & mask_8x8_1) & 1) { if (plane) { - aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, @@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_14; if ((mask_16x16 & two_block_mask) == two_block_mask) { - /* - aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - */ - - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } count = 2; } else { lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); @@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; if ((mask_8x8 & two_block_mask) == two_block_mask) { - /* - aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - */ - - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } count = 2; } else { lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & two_block_mask) == two_block_mask) { - /* aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, lfin->hev_thr); - */ - aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr); count = 2; } else { aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); @@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz( 
plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; if ((mask_16x16 & two_block_mask) == two_block_mask) { - /* - aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - */ - - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); + if (plane) { + aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } count = 2; } else { highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, @@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz( plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; if ((mask_8x8 & two_block_mask) == two_block_mask) { - /* - aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); - */ - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); + if (plane) { + aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } count = 2; } else { highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, @@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz( } } else if (mask_4x4 & 1) { if ((mask_4x4 & two_block_mask) == two_block_mask) { - /* aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, lfin->hev_thr, bd); - */ - aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - 
aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); count = 2; } else { aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, @@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz( } } -static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf, - uint8_t *dst_buf, int ref_stride, int dst_stride, - int start, int end) { - return 0; - - start <<= MI_SIZE_LOG2; - end <<= MI_SIZE_LOG2; - uint8_t *ref0 = ref_buf; - uint8_t *dst0 = dst_buf; - if (cm->seq_params.use_highbitdepth) { - const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf); - const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf); - for (int j = 0; j < 4; ++j) { - for (int i = start; i < end; ++i) - if (ref16[i] != dst16[i]) { - ref_buf = ref0; - dst_buf = dst0; - return i + 1; +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int row_step = (MI_SIZE >> MI_SIZE_LOG2); + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + int skip, prev_skip = 0; + int is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const uint64_t mask = ((uint64_t)1 
<< (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row][col]; break; + case 1: level = lfm->lfl_u[row][col]; break; + case 2: level = lfm->lfl_v[row][col]; break; + default: assert(plane >= 0 && plane <= 2); return; } - ref16 += ref_stride; - dst16 += dst_stride; + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64; + const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64; + const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } } - } else { - for (int j = 0; j < 4; ++j) { - for (int i = start; i < end; ++i) - if (ref_buf[i] != dst_buf[i]) { - ref_buf = ref0; - dst_buf = dst0; - return i + 1; + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int col_step = (MI_SIZE >> MI_SIZE_LOG2); + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, 
prev_level = 1; + int skip, prev_skip = 0; + int is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row][col]; break; + case 1: level = lfm->lfl_u[row][col]; break; + case 2: level = lfm->lfl_v[row][col]; break; + default: assert(plane >= 0 && plane <= 2); return; } - ref_buf += ref_stride; - dst_buf += dst_stride; + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64; + const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64; + const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: 
lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. 
filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u[row][col]; + lfl2 = &lfm->lfl_u[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v[row][col]; + lfl2 = &lfm->lfl_v[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); + dst->buf += 
two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + dst->buf += row_stride; + continue; } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, 
(int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); + dst->buf += row_stride; } - ref_buf = ref0; - dst_buf = dst0; - return 0; + // reset buf pointer for next block + dst->buf = buf0; } void av1_filter_block_plane_ver(AV1_COMMON *const cm, @@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, mask_16x16 = lfm->left_u[TX_16X16].bits[index]; mask_8x8 = lfm->left_u[TX_8X8].bits[index]; mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u_ver[row][col]; - lfl2 = &lfm->lfl_u_ver[row_next][col]; + lfl = &lfm->lfl_u[row][col]; + lfl2 = &lfm->lfl_u[row_next][col]; break; case 2: mask_16x16 = lfm->left_v[TX_16X16].bits[index]; mask_8x8 = lfm->left_v[TX_8X8].bits[index]; mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v_ver[row][col]; - lfl2 = &lfm->lfl_v_ver[row_next][col]; + lfl = &lfm->lfl_v[row][col]; + lfl2 = &lfm->lfl_v[row_next][col]; break; default: assert(pl >= 0 && pl <= 2); return; } @@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, mask_16x16 = lfm->above_u[TX_16X16].bits[index]; mask_8x8 = lfm->above_u[TX_8X8].bits[index]; mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u_hor[row][col]; + lfl = &lfm->lfl_u[row][col]; break; case 2: mask_16x16 = lfm->above_v[TX_16X16].bits[index]; mask_8x8 = lfm->above_v[TX_8X8].bits[index]; mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v_hor[row][col]; + lfl = &lfm->lfl_v[row][col]; break; default: assert(pl >= 0 && pl <= 2); return; } @@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif int plane_start, int plane_end) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; @@ -1827,6 +2231,45 @@ static void 
loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, int mi_row, mi_col; int plane; +#if LOOP_FILTER_BITMASK + if (is_decoding) { + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + plane, plane + 1); + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + + // apply loop filtering which only goes through buffer once + for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { + for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col, + plane, plane + 1); + av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, + mi_col); + if (mi_col - MI_SIZE_64X64 >= 0) { + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + return; + } +#endif + for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break; @@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, int plane_start, int plane_end, - int partial_frame) { + MACROBLOCKD *xd, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif + int plane_start, int plane_end, int partial_frame) { int 
start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); - loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start, - plane_end); + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, +#if LOOP_FILTER_BITMASK + is_decoding, +#endif + plane_start, plane_end); } diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h index c35c3b2dc..80ac61178 100644 --- a/third_party/aom/av1/common/av1_loopfilter.h +++ b/third_party/aom/av1/common/av1_loopfilter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_LOOPFILTER_H_ -#define AV1_COMMON_LOOPFILTER_H_ +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ #include "config/aom_config.h" @@ -60,51 +60,20 @@ typedef struct { uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64]; uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; - // U plane vertical edge and horizontal edge filter level - uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; - uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + // U plane filter level + uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64]; - // V plane vertical edge and horizontal edge filter level - uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; - uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; -} LoopFilterMask; + // V plane filter level + uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64]; -// To determine whether to apply loop filtering at one transform block edge, -// we need information of the neighboring transform block. Specifically, -// in determining a vertical edge, we need the information of the tx block -// to its left. For a horizontal edge, we need info of the tx block above it. 
-// Thus, we need to record info of right column and bottom row of tx blocks. -// We record the information of the neighboring superblock, when bitmask -// building for a superblock is finished. And it will be used for next -// superblock bitmask building. -// Information includes: -// ------------------------------------------------------------ -// MI_SIZE_64X64 -// Y tx_size above |--------------| -// Y tx_size left |--------------| -// UV tx_size above |--------------| -// UV tx_size left |--------------| -// Y level above |--------------| -// Y level left |--------------| -// U level above |--------------| -// U level left |--------------| -// V level above |--------------| -// V level left |--------------| -// skip |--------------| -// ------------------------------------------------------------ -typedef struct { - TX_SIZE tx_size_y_above[MI_SIZE_64X64]; - TX_SIZE tx_size_y_left[MI_SIZE_64X64]; - TX_SIZE tx_size_uv_above[MI_SIZE_64X64]; - TX_SIZE tx_size_uv_left[MI_SIZE_64X64]; - uint8_t y_level_above[MI_SIZE_64X64]; - uint8_t y_level_left[MI_SIZE_64X64]; - uint8_t u_level_above[MI_SIZE_64X64]; - uint8_t u_level_left[MI_SIZE_64X64]; - uint8_t v_level_above[MI_SIZE_64X64]; - uint8_t v_level_left[MI_SIZE_64X64]; - uint8_t skip[MI_SIZE_64X64]; -} LpfSuperblockInfo; + // other info + FilterMask skip; + FilterMask is_vert_border; + FilterMask is_horz_border; + // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64 + FilterMask tx_size_ver[2][5]; + FilterMask tx_size_hor[2][5]; +} LoopFilterMask; #endif // LOOP_FILTER_BITMASK struct loopfilter { @@ -130,7 +99,6 @@ struct loopfilter { LoopFilterMask *lfm; size_t lfm_num; int lfm_stride; - LpfSuperblockInfo neighbor_sb_lpf_info; #endif // LOOP_FILTER_BITMASK }; @@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm); void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, int plane_end); +#if LOOP_FILTER_BITMASK +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common 
*cm, + struct macroblockd *mbd, int is_decoding, + int plane_start, int plane_end, int partial_frame); +#else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *mbd, int plane_start, int plane_end, int partial_frame); +#endif void av1_filter_block_plane_vert(const struct AV1Common *const cm, const MACROBLOCKD *const xd, const int plane, @@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData { MACROBLOCKD *xd; } LFWorkerData; +uint8_t get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); #if LOOP_FILTER_BITMASK void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col, int plane, int subsampling_x, int subsampling_y, @@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm, void av1_filter_block_plane_hor(struct AV1Common *const cm, struct macroblockd_plane *const plane, int pl, int mi_row, int mi_col); +LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm, + int mi_row, int mi_col); +int get_index_shift(int mi_col, int mi_row, int *index); + +static const FilterMask left_txform_mask[TX_SIZES] = { + { { 0x0000000000000001ULL, // TX_4X4, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0000000000010001ULL, // TX_8X8, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_16X16, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_32X32, + 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_64X64, + 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, +}; + +static const uint64_t above_txform_mask[2][TX_SIZES] = { + { + 0x0000000000000001ULL, // TX_4X4 + 0x0000000000000003ULL, // TX_8X8 + 0x000000000000000fULL, // TX_16X16 + 0x00000000000000ffULL, // 
TX_32X32 + 0x000000000000ffffULL, // TX_64X64 + }, + { + 0x0000000000000001ULL, // TX_4X4 + 0x0000000000000005ULL, // TX_8X8 + 0x0000000000000055ULL, // TX_16X16 + 0x0000000000005555ULL, // TX_32X32 + 0x0000000055555555ULL, // TX_64X64 + }, +}; + +extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL]; + +extern const FilterMask left_mask_univariant_reordered[67]; + +extern const FilterMask above_mask_univariant_reordered[67]; #endif #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_LOOPFILTER_H_ +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index fa8b34981..dee1f1c79 100755 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; specialize qw/av1_highbd_wiener_convolve_add_src ssse3/; specialize qw/av1_highbd_wiener_convolve_add_src avx2/; + # directional intra predictor functions add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; - # FILTER_INTRA predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; specialize qw/av1_filter_intra_predictor sse4_1/; @@ -108,6 +108,22 @@ 
specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; +add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/; + +add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/; + add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/; add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/; 
add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/; add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_32x32 avx2/; add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/; - add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride # build compound seg mask functions add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w"; -specialize qw/av1_build_compound_diffwtd_mask sse4_1/; +specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, 
int bd"; specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd"; -specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/; +specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/; # # Encoder functions below this point. @@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; 
add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/av1_temporal_filter_apply sse2 msa/; - add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; + add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; # ENCODEMB INVOKE @@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts"; specialize qw/av1_get_nz_map_contexts sse2/; add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels"; - specialize qw/av1_txb_init_levels sse4_1/; + specialize qw/av1_txb_init_levels sse4_1 avx2/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; specialize 
qw/av1_wedge_sse_from_residuals sse2 avx2/; @@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length"; specialize qw/av1_get_crc32c_value sse4_2/; + add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H"; + specialize qw/av1_compute_stats sse4_1 avx2/; + + add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; + specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/; } # end encoder functions @@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_warp_affine sse4_1/; +specialize qw/av1_warp_affine sse4_1 neon/; add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_highbd_warp_affine sse4_1/; @@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int 
highbd"; specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/; -add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, int highbd"; +add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, int highbd"; specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/; # CONVOLVE_ROUND/COMPOUND_ROUND functions diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c index 1e6654121..bb70eab70 100644 --- a/third_party/aom/av1/common/av1_txfm.c +++ b/third_party/aom/av1/common/av1_txfm.c @@ -108,3 +108,53 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { 1, // TXFM_TYPE_IDENTITY16 1, // TXFM_TYPE_IDENTITY32 }; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + 
(void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h index c9cc79852..59d64ca4a 100644 --- a/third_party/aom/av1/common/av1_txfm.h +++ b/third_party/aom/av1/common/av1_txfm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_TXFM_H_ -#define AV1_TXFM_H_ +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ #include #include @@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5]; static const int cos_bit_min = 10; static const int cos_bit_max = 16; -static const int NewSqrt2Bits = 12; +#define NewSqrt2Bits ((int32_t)12) // 2^12 * sqrt(2) static const int32_t NewSqrt2 = 5793; // 2^12 / sqrt(2) @@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { #endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP bit = AOMMIN(bit, 31); - return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1))); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); #endif // DO_RANGE_CHECK_CLAMP (void)bit; return value; @@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) { static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) { int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(david.barker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). + // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. 
+ // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled. #if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX); + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); #endif - return round_shift(result_64, bit); + return (int32_t)(intermediate >> bit); } static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, @@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) { static INLINE int get_txh_idx(TX_SIZE tx_size) { return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; } + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit); #define MAX_TXWH_IDX 5 #ifdef __cplusplus } #endif // __cplusplus -#endif // AV1_TXFM_H_ +#endif // AOM_AV1_COMMON_AV1_TXFM_H_ diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c index 86b4b5d6c..2e796b656 100644 --- a/third_party/aom/av1/common/blockd.c +++ b/third_party/aom/av1/common/blockd.c @@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { return above_mi->mode; } -void av1_foreach_transformed_block_in_plane( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // transform size varies per plane, look it up in a common way. 
- const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const uint8_t txw_unit = tx_size_wide_unit[tx_size]; - const uint8_t txh_unit = tx_size_high_unit[tx_size]; - const int step = txw_unit * txh_unit; - int i = 0, r, c; - - // If mb_to_right_edge is < 0 we are in a situation in which - // the current block size extends into the UMV and we won't - // visit the sub blocks that are wholly within the UMV. - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - - int blk_row, blk_col; - - const BLOCK_SIZE max_unit_bsize = - get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); - int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; - int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; - mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); - mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); - - // Keep track of the row and column of the blocks we use so that we know - // if we are in the unrestricted motion border. - for (r = 0; r < max_blocks_high; r += mu_blocks_high) { - const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); - // Skip visiting the sub blocks that are wholly within the UMV. 
- for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) { - const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); - for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) { - for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) { - visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); - i += step; - } - } - } - } -} - -void av1_foreach_transformed_block(const MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - foreach_transformed_block_visitor visit, - void *arg, const int num_planes) { - for (int plane = 0; plane < num_planes; ++plane) { - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; - av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); - } -} - void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { @@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? ss_y : 0; } + for (i = num_planes; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = 1; + xd->plane[i].subsampling_y = 1; + } } const int16_t dr_intra_derivative[90] = { diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h index 979f13bd9..a2311c1b0 100644 --- a/third_party/aom/av1/common/blockd.h +++ b/third_party/aom/av1/common/blockd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_BLOCKD_H_ -#define AV1_COMMON_BLOCKD_H_ +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ #include "config/aom_config.h" @@ -38,13 +38,13 @@ extern "C" { #define MAX_DIFFWTD_MASK_BITS 1 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS -typedef enum { +typedef enum ATTRIBUTE_PACKED { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, } DIFFWTD_MASK_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only @@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { } static INLINE int is_inter_mode(PREDICTION_MODE mode) { - return mode >= NEARESTMV && mode <= NEW_NEWMV; + return mode >= INTER_MODE_START && mode < INTER_MODE_END; } typedef struct { @@ -66,10 +66,10 @@ typedef struct { } BUFFER_SET; static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { - return mode >= NEARESTMV && mode <= NEWMV; + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; } static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { - return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV; + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; } static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { @@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } -static INLINE int use_masked_motion_search(COMPOUND_TYPE type) { - return (type == COMPOUND_WEDGE); -} - static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); } @@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO { int mi_row; int mi_col; #endif - int num_proj_ref[2]; - WarpedMotionParams wm_params[2]; + int num_proj_ref; + WarpedMotionParams wm_params; // Index of the alpha Cb and alpha Cr combination int cfl_alpha_idx; @@ -376,7 +372,7 @@ static INLINE void 
mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, } #endif -enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; +enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; struct buf_2d { uint8_t *buf; @@ -500,6 +496,8 @@ typedef struct jnt_comp_params { int bck_offset; } JNT_COMP_PARAMS; +// Most/all of the pointers are mere pointers to actual arrays allocated +// elsewhere. This is mostly for coding convenience. typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; @@ -544,7 +542,7 @@ typedef struct macroblockd { SgrprojInfo sgrproj_info[MAX_MB_PLANE]; // block dimension in the unit of mode_info. - uint8_t n8_w, n8_h; + uint8_t n4_w, n4_h; uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; @@ -599,6 +597,9 @@ typedef struct macroblockd { uint16_t cb_offset[MAX_MB_PLANE]; uint16_t txb_offset[MAX_MB_PLANE]; uint16_t color_index_map_offset[2]; + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { @@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { } } +// For a square block size 'bsize', returns the size of the sub-blocks used by +// the given partition type. If the partition produces sub-blocks of different +// sizes, then the function returns the largest sub-block size. +// Implements the Partition_Subsize lookup table in the spec (Section 9.3. +// Conversion tables). // Note: the input block size should be square. // Otherwise it's considered invalid. static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, @@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, return intra_mode_to_tx_type(mbmi, plane_type); } +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function).
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { @@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -void av1_foreach_transformed_block_in_plane( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg); - -void av1_foreach_transformed_block(const MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - foreach_transformed_block_visitor visit, - void *arg, const int num_planes); - void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); @@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { } static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { - return (mode >= NEARESTMV) && (mode <= NEWMV); + return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); } static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { @@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, is_motion_variation_allowed_compound(mbmi)) { if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; assert(!has_second_ref(mbmi)); - if (mbmi->num_proj_ref[0] >= 1 && + if (mbmi->num_proj_ref >= 1 && (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; @@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) { } // extern "C" #endif -#endif // AV1_COMMON_BLOCKD_H_ +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h index 092230de9..3b2eac8a5 100644 --- a/third_party/aom/av1/common/cdef.h +++ b/third_party/aom/av1/common/cdef.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was 
not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_CDEF_H_ -#define AV1_COMMON_CDEF_H_ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ #define CDEF_STRENGTH_BITS 6 @@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_CDEF_H_ +#endif // AOM_AV1_COMMON_CDEF_H_ diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h index 81c6da077..6b4452cd6 100644 --- a/third_party/aom/av1/common/cdef_block.h +++ b/third_party/aom/av1/common/cdef_block.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_CDEF_BLOCK_H) -#define _CDEF_BLOCK_H (1) +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ #include "av1/common/odintrin.h" @@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, cdef_list *dlist, int cdef_count, int level, int sec_strength, int pri_damping, int sec_damping, int coeff_shift); -#endif +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h index d24a7c0fa..14587a023 100644 --- a/third_party/aom/av1/common/cdef_block_simd.h +++ b/third_party/aom/av1/common/cdef_block_simd.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + #include "config/av1_rtcd.h" #include "av1/common/cdef_block.h" @@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, } } } + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h index bc9fbce1b..d627891bf 100644 --- a/third_party/aom/av1/common/cfl.h +++ b/third_party/aom/av1/common/cfl.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_CFL_H_ -#define AV1_COMMON_CFL_H_ +#ifndef AOM_AV1_COMMON_CFL_H_ +#define AOM_AV1_COMMON_CFL_H_ #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" @@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst, return pred[tx_size % TX_SIZES_ALL]; \ } -#endif // AV1_COMMON_CFL_H_ +#endif // AOM_AV1_COMMON_CFL_H_ diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h index 72c6d3a1e..bed6083db 100644 --- a/third_party/aom/av1/common/common.h +++ b/third_party/aom/av1/common/common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_COMMON_H_ -#define AV1_COMMON_COMMON_H_ +#ifndef AOM_AV1_COMMON_COMMON_H_ +#define AOM_AV1_COMMON_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // AV1_COMMON_COMMON_H_ +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h index f521f10bf..46e455fdb 100644 --- a/third_party/aom/av1/common/common_data.h +++ b/third_party/aom/av1/common/common_data.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_COMMON_DATA_H_ -#define AV1_COMMON_COMMON_DATA_H_ +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ #include "av1/common/enums.h" #include "aom/aom_integer.h" @@ -20,34 +20,43 @@ extern "C" { #endif -// Log 2 conversion lookup tables in units of mode info(4x4). +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 }; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 }; +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 }; +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 }; -// Width/height lookup tables in units of various block sizes +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 }; +// The Block_Height table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 }; -// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize))) +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). 
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 }; @@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 }; +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. /* clang-format off */ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { { // PARTITION_NONE @@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_64X64, // TX_MODE_LARGEST TX_64X64, // TX_MODE_SELECT }; -/* clang-format on */ +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { - // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 - // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 - { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, - { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } }, - { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } }, - { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, - { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } }, - { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } }, - { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, - { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } }, - { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } }, - { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, - { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, - { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, - { { BLOCK_128X128, 
BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, - { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } }, - { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } }, - { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, - { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, - { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, - { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } }; +/* clang-format on */ // Generates 5 bit field 
in which each bit set to 1 represents // a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 @@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = { } // extern "C" #endif -#endif // AV1_COMMON_COMMON_DATA_H_ +#endif // AOM_AV1_COMMON_COMMON_DATA_H_ diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c index ed962c722..1f11126fc 100644 --- a/third_party/aom/av1/common/convolve.c +++ b/third_party/aom/av1/common/convolve.c @@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper( y_step_qn, conv_params); } +// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So +// we may create optimized code to do 2-tap filtering for all bilinear filtering +// usages, not just IntraBC. +static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + int subpel_x_q4, int subpel_y_q4, + ConvolveParams *conv_params) { + const InterpFilterParams *filter_params_x = + subpel_x_q4 ? &av1_intrabc_filter_params : NULL; + const InterpFilterParams *filter_params_y = + subpel_y_q4 ? 
&av1_intrabc_filter_params : NULL; + if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, conv_params); + } else if (subpel_x_q4 != 0) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, 0, 0, conv_params); + } else { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, 0, 0, conv_params); + } +} + void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf) { + const struct scale_factors *sf, int is_intrabc) { + assert(IMPLIES(is_intrabc, !scaled)); (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + + if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { + convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4, + subpel_y_q4, conv_params); + return; + } + + InterpFilter filter_x = 0; + InterpFilter filter_y = 0; + const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; + const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; + if (need_filter_params_x) + filter_x = av1_extract_interp_filter(interp_filters, 1); + if (need_filter_params_y) + filter_y = av1_extract_interp_filter(interp_filters, 0); const InterpFilterParams *filter_params_x = - av1_get_interp_filter_params_with_block_size(filter_x, w); + need_filter_params_x + ? av1_get_interp_filter_params_with_block_size(filter_x, w) + : NULL; const InterpFilterParams *filter_params_y = - av1_get_interp_filter_params_with_block_size(filter_y, h); + need_filter_params_y + ? 
av1_get_interp_filter_params_with_block_size(filter_y, h) + : NULL; - if (scaled) + if (scaled) { convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params); - else + } else { sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); + } } void av1_highbd_convolve_2d_copy_sr_c( @@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, } } +static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, int subpel_x_q4, + int subpel_y_q4, + ConvolveParams *conv_params, + int bd) { + const InterpFilterParams *filter_params_x = + subpel_x_q4 ? &av1_intrabc_filter_params : NULL; + const InterpFilterParams *filter_params_y = + subpel_y_q4 ? &av1_intrabc_filter_params : NULL; + if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } else if (subpel_x_q4 != 0) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } else { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } +} + void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int bd) { + const struct scale_factors *sf, + int is_intrabc, int bd) { + assert(IMPLIES(is_intrabc, !scaled)); (void)x_step_q4; (void)y_step_q4; (void)dst_stride; - const uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + + if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, + subpel_x_q4, subpel_y_q4, conv_params, bd); + return; + } + + InterpFilter filter_x = 0; + InterpFilter filter_y = 0; + const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; + const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; + if (need_filter_params_x) + filter_x = av1_extract_interp_filter(interp_filters, 1); + if (need_filter_params_y) + filter_y = av1_extract_interp_filter(interp_filters, 0); const InterpFilterParams *filter_params_x = - av1_get_interp_filter_params_with_block_size(filter_x, w); + need_filter_params_x + ? av1_get_interp_filter_params_with_block_size(filter_x, w) + : NULL; const InterpFilterParams *filter_params_y = - av1_get_interp_filter_params_with_block_size(filter_y, h); + need_filter_params_y + ? av1_get_interp_filter_params_with_block_size(filter_y, h) + : NULL; if (scaled) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h index bc2d4bccf..4109dd843 100644 --- a/third_party/aom/av1/common/convolve.h +++ b/third_party/aom/av1/common/convolve.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_AV1_CONVOLVE_H_ -#define AV1_COMMON_AV1_CONVOLVE_H_ +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ #include "av1/common/filter.h" #ifdef __cplusplus @@ -19,7 +19,6 @@ extern "C" { typedef uint16_t CONV_BUF_TYPE; typedef struct ConvolveParams { - int ref; int do_average; CONV_BUF_TYPE *dst; int dst_stride; @@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf); + const struct scale_factors *sf, int is_intrabc); -static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, - int plane, +static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { ConvolveParams conv_params; - conv_params.ref = ref; conv_params.do_average = do_average; assert(IMPLIES(do_average, is_compound)); conv_params.is_compound = is_compound; @@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, return conv_params; } -static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane, +static INLINE ConvolveParams get_conv_params(int do_average, int plane, int bd) { - return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd); + return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); } static INLINE ConvolveParams get_conv_params_wiener(int bd) { ConvolveParams conv_params; (void)bd; - conv_params.ref = 0; conv_params.do_average = 0; conv_params.is_compound = 0; conv_params.round_0 = WIENER_ROUND0_BITS; @@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int 
bd); + const struct scale_factors *sf, + int is_intrabc, int bd); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_AV1_CONVOLVE_H_ +#endif // AOM_AV1_COMMON_CONVOLVE_H_ diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h index ef944c5a0..991692c2f 100644 --- a/third_party/aom/av1/common/entropy.h +++ b/third_party/aom/av1/common/entropy.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPY_H_ -#define AV1_COMMON_ENTROPY_H_ +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ #include "config/aom_config.h" @@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { } // extern "C" #endif -#endif // AV1_COMMON_ENTROPY_H_ +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h index 0bd2e20a1..7047f34d2 100644 --- a/third_party/aom/av1/common/entropymode.h +++ b/third_party/aom/av1/common/entropymode.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPYMODE_H_ -#define AV1_COMMON_ENTROPYMODE_H_ +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ #include "av1/common/entropy.h" #include "av1/common/entropymv.h" @@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas); void av1_setup_frame_contexts(struct AV1Common *cm); void av1_setup_past_independence(struct AV1Common *cm); +// Returns (int)ceil(log2(n)). +// NOTE: This implementation only works for n <= 2^30. 
static INLINE int av1_ceil_log2(int n) { if (n < 2) return 0; int i = 1, p = 2; @@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, } // extern "C" #endif -#endif // AV1_COMMON_ENTROPYMODE_H_ +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c index 446aa433c..491337387 100644 --- a/third_party/aom/av1/common/entropymv.c +++ b/third_party/aom/av1/common/entropymv.c @@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = { } }, }; -static const uint8_t log_in_base_2[] = { - 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 -}; - -static INLINE int mv_class_base(MV_CLASS_TYPE c) { - return c ? CLASS0_SIZE << (c + 2) : 0; -} - -MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { - const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) - ? 
MV_CLASS_10 - : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; - if (offset) *offset = z - mv_class_base(c); - return c; -} - void av1_init_mv_probs(AV1_COMMON *cm) { // NB: this sets CDFs too cm->fc->nmvc = default_nmv_context; diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h index 02ca7b66b..fa818a2c1 100644 --- a/third_party/aom/av1/common/entropymv.h +++ b/third_party/aom/av1/common/entropymv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPYMV_H_ -#define AV1_COMMON_ENTROPYMV_H_ +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ #include "config/aom_config.h" @@ -91,16 +91,6 @@ typedef struct { nmv_component comps[2]; } nmv_context; -static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { - if (mv->row == 0) { - return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; - } else { - return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; - } -} - -MV_CLASS_TYPE av1_get_mv_class(int z, int *offset); - typedef enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, @@ -111,4 +101,4 @@ typedef enum { } // extern "C" #endif -#endif // AV1_COMMON_ENTROPYMV_H_ +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h index 689c25f30..869c06ef2 100644 --- a/third_party/aom/av1/common/enums.h +++ b/third_party/aom/av1/common/enums.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_ENUMS_H_ -#define AV1_COMMON_ENUMS_H_ +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ #include "config/aom_config.h" @@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED { TX_TYPES, } TX_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { REG_REG, REG_SMOOTH, REG_SHARP, @@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED { COMP_INTER_MODE_START = NEAREST_NEARESTMV, COMP_INTER_MODE_END = MB_MODE_COUNT, COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks } PREDICTION_MODE; @@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED { INTERINTRA_MODES } INTERINTRA_MODE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { COMPOUND_AVERAGE, COMPOUND_WEDGE, COMPOUND_DIFFWTD, @@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED { } // extern "C" #endif -#endif // AV1_COMMON_ENUMS_H_ +#endif // AOM_AV1_COMMON_ENUMS_H_ diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h index 7f8ad583a..571422d11 100644 --- a/third_party/aom/av1/common/filter.h +++ b/third_party/aom/av1/common/filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_FILTER_H_ -#define AV1_COMMON_FILTER_H_ +#ifndef AOM_AV1_COMMON_FILTER_H_ +#define AOM_AV1_COMMON_FILTER_H_ #include @@ -139,6 +139,17 @@ static const InterpFilterParams BILINEAR } }; +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. 
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = { + 64, + 64, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, 0, BILINEAR +}; + DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, @@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, return &av1_interp_filter_params_list[interp_filter]; } +static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params( + const InterpFilter interp_filter) { + return &av1_interp_4tap[interp_filter]; +} + static INLINE const int16_t *av1_get_interp_filter_kernel( const InterpFilter interp_filter) { return av1_interp_filter_params_list[interp_filter].filter_ptr; @@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( } // extern "C" #endif -#endif // AV1_COMMON_FILTER_H_ +#endif // AOM_AV1_COMMON_FILTER_H_ diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c index 502ccd27d..fd6c4bc79 100644 --- a/third_party/aom/av1/common/frame_buffers.c +++ b/third_party/aom/av1/common/frame_buffers.c @@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { list->int_fb = NULL; } +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + if (list->int_fb[i].data && !list->int_fb[i].in_use) + memset(list->int_fb[i].data, 0, list->int_fb[i].size); + } +} + int av1_get_frame_buffer(void *cb_priv, size_t min_size, aom_codec_frame_buffer_t *fb) { int i; diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h index e7341cfdd..16188e51c 100644 --- a/third_party/aom/av1/common/frame_buffers.h +++ b/third_party/aom/av1/common/frame_buffers.h @@ -9,8 +9,8 
@@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_FRAME_BUFFERS_H_ -#define AV1_COMMON_FRAME_BUFFERS_H_ +#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_ +#define AOM_AV1_COMMON_FRAME_BUFFERS_H_ #include "aom/aom_frame_buffer.h" #include "aom/aom_integer.h" @@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list); // Free any data allocated to the frame buffers. void av1_free_internal_frame_buffers(InternalFrameBufferList *list); +// Zeros all unused internal frame buffers. In particular, this zeros the +// frame borders. Call this function after a sequence header change to +// re-initialize the frame borders for the different width, height, or bit +// depth. +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list); + // Callback used by libaom to request an external frame buffer. |cb_priv| // Callback private data, which points to an InternalFrameBufferList. // |min_size| is the minimum size in bytes needed to decode the next frame. @@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // AV1_COMMON_FRAME_BUFFERS_H_ +#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_ diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c index bc758eb57..2c1cb9827 100644 --- a/third_party/aom/av1/common/idct.c +++ b/third_party/aom/av1/common/idct.c @@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) { // that input and output could be the same buffer. 
// idct -static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, - int stride, int eob, int bd) { +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { if (eob > 1) av1_highbd_iwht4x4_16_add(input, dest, stride, bd); else av1_highbd_iwht4x4_1_add(input, dest, stride, bd); } -static const int32_t *cast_to_int32(const tran_low_t *input) { - assert(sizeof(int32_t) == sizeof(tran_low_t)); - return (const int32_t *)input; -} - -void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; @@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); - highbd_iwht4x4_add(input, dest, stride, eob, bd); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - } + + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + 
txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x8_c(src, 
CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + 
av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); -} -static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); - break; - } } -static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: - av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - } + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_inv_txfm2d_add_32x32(src, 
CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: assert(0); - } + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } -static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); assert(tx_type == DCT_DCT); - av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, @@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } -static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_32X32: - highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); break; case TX_16X16: - highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); break; case TX_8X8: - highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: - 
highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); break; case TX_8X4: - highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); break; case TX_8X16: - highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); break; case TX_16X8: - highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: - highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); break; case TX_32X16: - highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); break; case TX_64X64: - highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: - highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); break; case TX_64X32: - highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); break; case TX_16X64: - highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); break; case TX_64X16: - highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); break; case TX_4X16: - highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); break; case TX_8X32: - highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); break; case TX_32X8: - highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } @@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, } } - highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, + txfm_param); for (int r = 0; r < h; ++r) { for (int c = 0; c < w; ++c) { @@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd, assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); if (txfm_param.is_hbd) { - highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } else { av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h index 50032a167..d9454e73f 100644 --- a/third_party/aom/av1/common/idct.h +++ b/third_party/aom/av1/common/idct.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_IDCT_H_ -#define AV1_COMMON_IDCT_H_ +#ifndef AOM_AV1_COMMON_IDCT_H_ +#define AOM_AV1_COMMON_IDCT_H_ #include "config/aom_config.h" @@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, int stride, int eob, int reduced_tx_set); +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); + +static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { + assert(sizeof(int32_t) == sizeof(tran_low_t)); + return (const int32_t *)input; +} + +typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *param); + +highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8; -void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_IDCT_H_ +#endif // AOM_AV1_COMMON_IDCT_H_ diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h index c2495640e..5b0225192 100644 --- a/third_party/aom/av1/common/mv.h +++ b/third_party/aom/av1/common/mv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_MV_H_ -#define AV1_COMMON_MV_H_ +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ #include "av1/common/common.h" #include "av1/common/common_data.h" @@ -56,7 +56,7 @@ typedef struct mv32 { #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ -typedef enum { +typedef enum ATTRIBUTE_PACKED { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter @@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // AV1_COMMON_MV_H_ +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c index 6939df335..7f24ab4e6 100644 --- a/third_party/aom/av1/common/mvref_common.c +++ b/third_party/aom/av1/common/mvref_common.c @@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); - int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); - int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); const int clamp_max = MV_UPP - 1; const int clamp_min = MV_LOW + 1; output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); } -void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, - int mi_row, int mi_col, int x_mis, int y_mis) { +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); @@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, int *processed_rows) { - int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); + int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); const int n8_w_8 = mi_size_wide[BLOCK_8X8]; const int n8_w_16 = mi_size_wide[BLOCK_16X16]; int i; int col_offset = 0; - const int shift = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. 
if (abs(row_offset) > 1) { col_offset = 1; - if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset; + if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset; } - const int use_step_16 = (xd->n8_w >= 16); + const int use_step_16 = (xd->n4_w >= 16); MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; (void)mi_row; for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->sb_type; - const int n8_w = mi_size_wide[candidate_bsize]; - int len = AOMMIN(xd->n8_w, n8_w); + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->n4_w, n4_w); if (use_step_16) len = AOMMAX(n8_w_16, len); else if (abs(row_offset) > 1) len = AOMMAX(len, n8_w_8); int weight = 2; - if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) { + if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) { int inc = AOMMIN(-max_row_offset + row_offset + 1, mi_size_high[candidate_bsize]); // Obtain range used in weight calculation. - weight = AOMMAX(weight, (inc << shift)); + weight = AOMMAX(weight, inc); // Update processed rows. 
*processed_rows = inc - row_offset - 1; } @@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, int *processed_cols) { - int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); + int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; int i; int row_offset = 0; - const int shift = 0; if (abs(col_offset) > 1) { row_offset = 1; - if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset; + if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset; } - const int use_step_16 = (xd->n8_h >= 16); + const int use_step_16 = (xd->n4_h >= 16); (void)mi_col; for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; const int candidate_bsize = candidate->sb_type; - const int n8_h = mi_size_high[candidate_bsize]; - int len = AOMMIN(xd->n8_h, n8_h); + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->n4_h, n4_h); if (use_step_16) len = AOMMAX(n8_h_16, len); else if (abs(col_offset) > 1) len = AOMMAX(len, n8_h_8); int weight = 2; - if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) { + if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) { int inc = AOMMIN(-max_col_offset + col_offset + 1, mi_size_wide[candidate_bsize]); // Obtain range used in weight calculation. - weight = AOMMAX(weight, (inc << shift)); + weight = AOMMAX(weight, inc); // Update processed cols. 
*processed_cols = inc - col_offset - 1; } @@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, mi_pos.row = row_offset; mi_pos.col = col_offset; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) { + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { const MB_MODE_INFO *const candidate = xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; const int len = mi_size_wide[BLOCK_8X8]; @@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, // The left hand of two vertical rectangles always has a top right (as the // block above will have been decoded) - if (xd->n8_w < xd->n8_h) + if (xd->n4_w < xd->n4_h) if (!xd->is_sec_rect) has_tr = 1; // The bottom of two horizontal rectangles never has a top right (as the block // to the right won't have been decoded) - if (xd->n8_w > xd->n8_h) + if (xd->n4_w > xd->n4_h) if (xd->is_sec_rect) has_tr = 0; // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition if (xd->mi[0]->partition == PARTITION_VERT_A) { - if (xd->n8_w == xd->n8_h) + if (xd->n4_w == xd->n4_h) if (mask_row & bs) has_tr = 0; } @@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; - if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0; + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; const TPL_MV_REF *prev_frame_mvs = cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) + @@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, return 0; } +static void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { + const int_mv 
stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == refmv_count[ref_frame]) { + ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. + ref_mv_stack[ref_frame][stack_idx].weight = 2; + ++refmv_count[ref_frame]; + } + } + } +} + static void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, int mi_row, int mi_col, int16_t *mode_context) { - const int bs = AOMMAX(xd->n8_w, xd->n8_h); + const int bs = AOMMAX(xd->n4_w, xd->n4_h); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; - const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); - const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); int processed_rows = 0; int processed_cols = 0; @@ -455,17 +511,16 @@ static void setup_ref_mv_list( if (xd->up_available) { max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; - if (xd->n8_h < mi_size_high[BLOCK_8X8]) + if (xd->n4_h < mi_size_high[BLOCK_8X8]) max_row_offset = -(2 << 1) + row_adj; - max_row_offset = - find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset); + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); } if (xd->left_available) { max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; - if (xd->n8_w < mi_size_wide[BLOCK_8X8]) + if (xd->n4_w < mi_size_wide[BLOCK_8X8]) max_col_offset = -(2 << 1) + col_adj; 
max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); @@ -487,12 +542,12 @@ static void setup_ref_mv_list( gm_mv_candidates, max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w, + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w, ref_mv_stack[ref_frame], &row_match_count, &newmv_count, gm_mv_candidates, &refmv_count[ref_frame]); - uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); - uint8_t nearest_refmv_count = refmv_count[ref_frame]; + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = refmv_count[ref_frame]; // TODO(yunqing): for comp_search, do it for all 3 cases. for (int idx = 0; idx < nearest_refmv_count; ++idx) @@ -500,27 +555,27 @@ static void setup_ref_mv_list( if (cm->allow_ref_frame_mvs) { int is_available = 0; - const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h); - const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w); - const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]); - const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]); + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w); + const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]); const int tpl_sample_pos[3][2] = { { voffset, -2 }, { voffset, hoffset }, { voffset - 2, hoffset }, }; - const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) && - (xd->n8_h < mi_size_high[BLOCK_64X64]) && - (xd->n8_w >= mi_size_wide[BLOCK_8X8]) && - (xd->n8_w < mi_size_wide[BLOCK_64X64]); - - int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64]) - ? mi_size_high[BLOCK_16X16] - : mi_size_high[BLOCK_8X8]; - int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64]) - ? 
mi_size_wide[BLOCK_16X16] - : mi_size_wide[BLOCK_8X8]; + const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) && + (xd->n4_h < mi_size_high[BLOCK_64X64]) && + (xd->n4_w >= mi_size_wide[BLOCK_8X8]) && + (xd->n4_w < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64]) + ? mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { @@ -569,7 +624,7 @@ static void setup_ref_mv_list( max_col_offset, &processed_cols); } - uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); switch (nearest_match) { case 0: @@ -636,62 +691,24 @@ static void setup_ref_mv_list( int_mv ref_id[2][2], ref_diff[2][2]; int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); int mi_size = AOMMIN(mi_width, mi_height); for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; - const int candidate_bsize = candidate->sb_type; - - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; - - for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { - if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { - ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; - ++ref_id_count[cmp_idx]; - } else if (can_rf > INTRA_FRAME && 
ref_diff_count[cmp_idx] < 2) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[can_rf] != - cm->ref_frame_sign_bias[rf[cmp_idx]]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; - ++ref_diff_count[cmp_idx]; - } - } - } - idx += mi_size_wide[candidate_bsize]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; - const int candidate_bsize = candidate->sb_type; - - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; - - for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { - if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { - ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; - ++ref_id_count[cmp_idx]; - } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[can_rf] != - cm->ref_frame_sign_bias[rf[cmp_idx]]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; - ++ref_diff_count[cmp_idx]; - } - } - } - idx += mi_size_high[candidate_bsize]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->sb_type]; } // Build up the compound mv predictor @@ -743,87 +760,37 @@ static void setup_ref_mv_list( for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv, - xd->n8_w << 
MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); } } else { // Handle single reference frame extension - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); int mi_size = AOMMIN(mi_width, mi_height); for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; - const int candidate_bsize = candidate->sb_type; - - // TODO(jingning): Refactor the following code. - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != - cm->ref_frame_sign_bias[ref_frame]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; - if (this_mv.as_int == stack_mv.as_int) break; - } - - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; - - // TODO(jingning): Set an arbitrary small number here. The weight - // doesn't matter as long as it is properly initialized. 
- ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; - } - } - } - idx += mi_size_wide[candidate_bsize]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack); + idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; - const int candidate_bsize = candidate->sb_type; - - // TODO(jingning): Refactor the following code. - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != - cm->ref_frame_sign_bias[ref_frame]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; - if (this_mv.as_int == stack_mv.as_int) break; - } - - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; - - // TODO(jingning): Set an arbitrary small number here. The weight - // doesn't matter as long as it is properly initialized. - ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; - } - } - } - idx += mi_size_high[candidate_bsize]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack); + idx += mi_size_high[candidate->sb_type]; } for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); } if (mv_ref_list != NULL) { @@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, const int col_offset = (mv.col >= 0) ? 
mv.col >> (4 + MI_SIZE_LOG2)) : -((-mv.col) >> (4 + MI_SIZE_LOG2)); - int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; - int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; + const int row = + (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; + const int col = + (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 || col >= (cm->mi_cols >> 1)) @@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, return 1; } -static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame, - int dir) { +// Note: motion_field_projection finds motion vectors of current frame's +// reference frame, and projects them to current frame. To make it clear, +// let's call current frame's reference frame as start frame. +// Call Start frame's reference frames as reference frames. +// Call ref_offset as frame distances between start frame and its reference +// frames.
+static int motion_field_projection(AV1_COMMON *cm, + MV_REFERENCE_FRAME start_frame, int dir) { TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; (void)dir; - int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx; - if (ref_frame_idx < 0) return 0; + const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx; + if (start_frame_idx < 0) return 0; - if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0; + if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0; - if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows || - cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols) + if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows || + cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols) return 0; - int ref_frame_index = - cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset; - unsigned int *ref_rf_idx = - &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0]; - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index); + const int start_frame_offset = + cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset; + const unsigned int *const ref_frame_offsets = + &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0]; + const int cur_frame_offset = cm->cur_frame->cur_frame_offset; + int start_to_current_frame_offset = + get_relative_dist(cm, start_frame_offset, cur_frame_offset); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = - get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]); + ref_offset[rf] = get_relative_dist(cm, start_frame_offset, + ref_frame_offsets[rf - LAST_FRAME]); } - if (dir == 2) ref_to_cur = -ref_to_cur; + if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; - MV_REF *mv_ref_base = 
cm->buffer_pool->frame_bufs[ref_frame_idx].mvs; + MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs; const int mvs_rows = (cm->mi_rows + 1) >> 1; const int mvs_cols = (cm->mi_cols + 1) >> 1; @@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame, int mi_r, mi_c; const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; - int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && - ref_frame_offset > 0 && - abs(ref_to_cur) <= MAX_FRAME_DISTANCE; + int pos_valid = + abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && + ref_frame_offset > 0 && + abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE; if (pos_valid) { - get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur, - ref_frame_offset); + get_mv_projection(&this_mv.as_mv, fwd_mv, + start_to_current_frame_offset, ref_frame_offset); pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, this_mv.as_mv, dir >> 1); } if (pos_valid) { - int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; + const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; @@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, if (up_available) { int mi_row_offset = -1; MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride]; - uint8_t n8_w = mi_size_wide[mbmi->sb_type]; + uint8_t n4_w = mi_size_wide[mbmi->sb_type]; - if (xd->n8_w <= n8_w) { + if (xd->n4_w <= n4_w) { // Handle "current block width <= above block width" case. 
- int col_offset = -mi_col % n8_w; + int col_offset = -mi_col % n4_w; if (col_offset < 0) do_tl = 0; - if (col_offset + n8_w > xd->n8_w) do_tr = 0; + if (col_offset + n4_w > xd->n4_w) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); @@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block width > above block width" case. - for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) { + for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) { int mi_col_offset = i; mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n8_w = mi_size_wide[mbmi->sb_type]; - mi_step = AOMMIN(xd->n8_w, n8_w); + n4_w = mi_size_wide[mbmi->sb_type]; + mi_step = AOMMIN(xd->n4_w, n4_w); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, int mi_col_offset = -1; MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; - uint8_t n8_h = mi_size_high[mbmi->sb_type]; + uint8_t n4_h = mi_size_high[mbmi->sb_type]; - if (xd->n8_h <= n8_h) { + if (xd->n4_h <= n4_h) { // Handle "current block height <= above block height" case. - int row_offset = -mi_row % n8_h; + int row_offset = -mi_row % n4_h; if (row_offset < 0) do_tl = 0; @@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block height > above block height" case. 
- for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) { + for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) { int mi_row_offset = i; mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n8_h = mi_size_high[mbmi->sb_type]; - mi_step = AOMMIN(xd->n8_h, n8_h); + n4_h = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->n4_h, n4_h); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-right block if (do_tr && - has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) { - POSITION trb_pos = { -1, xd->n8_w }; + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) { + POSITION trb_pos = { -1, xd->n4_w }; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) { + if (is_inside(tile, mi_col, mi_row, &trb_pos)) { int mi_row_offset = -1; - int mi_col_offset = xd->n8_w; + int mi_col_offset = xd->n4_w; MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1); + record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1); np++; if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } @@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, REF_FRAME_INFO *ref_info) { - assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME); + assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); const int buf_idx = ref_info->buf_idx; diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h index f68c159e1..83f7a1ac0 100644 --- a/third_party/aom/av1/common/mvref_common.h +++ b/third_party/aom/av1/common/mvref_common.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not 
distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_MVREF_COMMON_H_ -#define AV1_COMMON_MVREF_COMMON_H_ +#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ +#define AOM_AV1_COMMON_MVREF_COMMON_H_ #include "av1/common/onyxc_int.h" #include "av1/common/blockd.h" @@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, - int mi_rows, const POSITION *mi_pos) { - const int dependent_horz_tile_flag = 0; - if (dependent_horz_tile_flag && !tile->tg_horz_boundary) { - return !(mi_row + mi_pos->row < 0 || - mi_col + mi_pos->col < tile->mi_col_start || - mi_row + mi_pos->row >= mi_rows || - mi_col + mi_pos->col >= tile->mi_col_end); - } else { - return !(mi_row + mi_pos->row < tile->mi_row_start || - mi_col + mi_pos->col < tile->mi_col_start || - mi_row + mi_pos->row >= tile->mi_row_end || - mi_col + mi_pos->col >= tile->mi_col_end); - } + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < tile->mi_row_start || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= tile->mi_row_end || + mi_col + mi_pos->col >= tile->mi_col_end); } static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, - int mi_rows, int row_offset) { - const int dependent_horz_tile_flag = 0; - if (dependent_horz_tile_flag && !tile->tg_horz_boundary) - return clamp(row_offset, -mi_row, mi_rows - mi_row - 1); - else - return clamp(row_offset, tile->mi_row_start - mi_row, - tile->mi_row_end - mi_row - 1); + int row_offset) { + return clamp(row_offset, tile->mi_row_start - mi_row, + tile->mi_row_end - mi_row - 1); } static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, @@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { } } 
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, - int mi_row, int mi_col, int x_mis, int y_mis); +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis); void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, @@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) -#define USE_WAVE_FRONT 1 // Use only top left area of frame for reference. static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row, int mi_col) { @@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; -#if USE_WAVE_FRONT + // Wavefront constraint: use only top left area of frame for reference. const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); const int wf_offset = gradient * (active_sb_row - src_sb_row); if (src_sb_row > active_sb_row || src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) return 0; -#endif return 1; } @@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, } // extern "C" #endif -#endif // AV1_COMMON_MVREF_COMMON_H_ +#endif // AOM_AV1_COMMON_MVREF_COMMON_H_ diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h index 3918c82c6..1c90cd93f 100644 --- a/third_party/aom/av1/common/obmc.h +++ b/third_party/aom/av1/common/obmc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_OBMC_H_ -#define AV1_COMMON_OBMC_H_ +#ifndef AOM_AV1_COMMON_OBMC_H_ +#define AOM_AV1_COMMON_OBMC_H_ typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos, uint8_t nb_mi_size, @@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, // prev_row_mi points into the mi array, starting at the beginning of the // previous row. MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; - const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols); + const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { @@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*above_mi)) { ++nb_count; - fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi, + fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi, fun_ctxt, num_planes); } } @@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, // prev_col_mi points into the mi array, starting at the top of the // previous column MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; - const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows); + const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { @@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*left_mi)) { ++nb_count; - fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi, + fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi, fun_ctxt, num_planes); } } } -#endif // AV1_COMMON_OBMC_H_ +#endif // AOM_AV1_COMMON_OBMC_H_ diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c new file mode 100644 
index 000000000..823b700b1 --- /dev/null +++ b/third_party/aom/av1/common/obu_util.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "av1/common/obu_util.h" + +#include "aom_dsp/bitreader_buffer.h" + +// Returns 1 when OBU type is valid, and 0 otherwise. +static int valid_obu_type(int obu_type) { + int valid_type = 0; + switch (obu_type) { + case OBU_SEQUENCE_HEADER: + case OBU_TEMPORAL_DELIMITER: + case OBU_FRAME_HEADER: + case OBU_TILE_GROUP: + case OBU_METADATA: + case OBU_FRAME: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_TILE_LIST: + case OBU_PADDING: valid_type = 1; break; + default: break; + } + return valid_type; +} + +static aom_codec_err_t read_obu_size(const uint8_t *data, + size_t bytes_available, + size_t *const obu_size, + size_t *const length_field_size) { + uint64_t u_obu_size = 0; + if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != + 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + + if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; + *obu_size = (size_t)u_obu_size; + return AOM_CODEC_OK; +} + +// Parses OBU header and stores values in 'header'. 
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, + int is_annexb, ObuHeader *header) { + if (!rb || !header) return AOM_CODEC_INVALID_PARAM; + + const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; + if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size = 1; + + if (aom_rb_read_bit(rb) != 0) { + // Forbidden bit. Must not be set. + return AOM_CODEC_CORRUPT_FRAME; + } + + header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); + + if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME; + + header->has_extension = aom_rb_read_bit(rb); + header->has_size_field = aom_rb_read_bit(rb); + + if (!header->has_size_field && !is_annexb) { + // section 5 obu streams must have obu_size field set. + return AOM_CODEC_UNSUP_BITSTREAM; + } + + if (aom_rb_read_bit(rb) != 0) { + // obu_reserved_1bit must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + + if (header->has_extension) { + if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size += 1; + header->temporal_layer_id = aom_rb_read_literal(rb, 3); + header->spatial_layer_id = aom_rb_read_literal(rb, 2); + if (aom_rb_read_literal(rb, 3) != 0) { + // extension_header_reserved_3bits must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb) { + if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; + + // TODO(tomfinegan): Set the error handler here and throughout this file, and + // confirm parsing work done via aom_read_bit_buffer is successful. 
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size = 0, obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + read_obu_size(data, bytes_available, &obu_size, &length_field_size); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (is_annexb) { + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size(data + obu_header->size, + bytes_available - obu_header->size, payload_size, + &length_field_size); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = length_field_size + obu_header->size; + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h new file mode 100644 index 000000000..7c56904c8 --- /dev/null +++ b/third_party/aom/av1/common/obu_util.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_OBU_UTIL_H_ +#define AOM_AV1_COMMON_OBU_UTIL_H_ + +#include "aom/aom_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; + // The following fields come from the OBU extension header and therefore are + // only used if has_extension is true. + int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_OBU_UTIL_H_ diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h index e87c5a0bf..e1db0f44d 100644 --- a/third_party/aom/av1/common/odintrin.h +++ b/third_party/aom/av1/common/odintrin.h @@ -11,8 +11,8 @@ /* clang-format off */ -#ifndef AV1_COMMON_ODINTRIN_H_ -#define AV1_COMMON_ODINTRIN_H_ +#ifndef AOM_AV1_COMMON_ODINTRIN_H_ +#define AOM_AV1_COMMON_ODINTRIN_H_ #include <stdlib.h> #include <string.h> @@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; #define OD_MAXI AOMMAX #define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max))) -#define OD_CLZ0 (1) -#define OD_CLZ(x) (-get_msb(x)) -#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x)) +/*Integer logarithm (base 2) of a nonzero
unsigned 32-bit integer. + OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/ +#define OD_ILOG_NZ(x) (1 + get_msb(x)) /*Enable special features for gcc and compatible compilers.*/ #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) @@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; } // extern "C" #endif -#endif // AV1_COMMON_ODINTRIN_H_ +#endif // AOM_AV1_COMMON_ODINTRIN_H_ diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h index 6b1bf2d74..ff011c89e 100644 --- a/third_party/aom/av1/common/onyxc_int.h +++ b/third_party/aom/av1/common/onyxc_int.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ONYXC_INT_H_ -#define AV1_COMMON_ONYXC_INT_H_ +#ifndef AOM_AV1_COMMON_ONYXC_INT_H_ +#define AOM_AV1_COMMON_ONYXC_INT_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" @@ -480,6 +480,7 @@ typedef struct AV1Common { int byte_alignment; int skip_loop_filter; + int skip_film_grain; // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->chroma_left_mbmi = chroma_left_mi; } - xd->n8_h = bh; - xd->n8_w = bw; + xd->n4_h = bh; + xd->n4_w = bw; xd->is_sec_rect = 0; - if (xd->n8_w < xd->n8_h) { + if (xd->n4_w < xd->n4_h) { // Only mark is_sec_rect as 1 for the last block. // For PARTITION_VERT_4, it would be (0, 0, 0, 1); // For other partitions, it would be (0, 1). 
- if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1; + if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1; } - if (xd->n8_w > xd->n8_h) - if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1; + if (xd->n4_w > xd->n4_h) + if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1; } static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, @@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { for (i = 0; i < len; ++i) txfm_ctx[i] = txs; } -static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip, +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, const MACROBLOCKD *xd) { uint8_t bw = tx_size_wide[tx_size]; uint8_t bh = tx_size_high[tx_size]; if (skip) { - bw = n8_w * MI_SIZE; - bh = n8_h * MI_SIZE; + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; } - set_txfm_ctx(xd->above_txfm_context, bw, n8_w); - set_txfm_ctx(xd->left_txfm_context, bh, n8_h); + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); } static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, @@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { } // extern "C" #endif -#endif // AV1_COMMON_ONYXC_INT_H_ +#endif // AOM_AV1_COMMON_ONYXC_INT_H_ diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c index 58933a7b3..026a07809 100644 --- a/third_party/aom/av1/common/ppc/cfl_ppc.c +++ b/third_party/aom/av1/common/ppc/cfl_ppc.c @@ -24,19 +24,21 @@ #define CFL_LINE_2 128 #define CFL_LINE_3 192 -typedef vector int8_t int8x16_t; -typedef vector uint8_t uint8x16_t; -typedef vector int16_t int16x8_t; -typedef vector uint16_t uint16x8_t; -typedef vector int32_t int32x4_t; -typedef vector uint32_t uint32x4_t; -typedef vector uint64_t uint64x2_t; +typedef vector signed char int8x16_t; // NOLINT(runtime/int) +typedef vector unsigned char uint8x16_t; // 
NOLINT(runtime/int) +typedef vector signed short int16x8_t; // NOLINT(runtime/int) +typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) +typedef vector signed int int32x4_t; // NOLINT(runtime/int) +typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) +typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) -static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, - int height, int round_offset, +static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, + int width, int height, int round_offset, int num_pel_log2) { - const int16_t *end = pred_buf + height * CFL_BUF_LINE; - const int16_t *sum_buf = pred_buf; + // int16_t *dst = dst_ptr; + const int16_t *dst_end = dst + height * CFL_BUF_LINE; + const int16_t *sum_buf = (const int16_t *)src_ptr; + const int16_t *end = sum_buf + height * CFL_BUF_LINE; const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; @@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, const int32x4_t avg = vec_sr(sum_32x4, div_shift); const int16x8_t vec_avg = vec_pack(avg, avg); do { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg), - OFF_0 + CFL_BUF_LINE_BYTES, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg), - OFF_0 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg), - OFF_0 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), 
vec_avg), + OFF_0 + CFL_LINE_3, dst); if (width >= 16) { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg), - OFF_1 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg), - OFF_1 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg), - OFF_1 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); } if (width == 32) { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg), - OFF_2 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg), - OFF_2 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg), - OFF_2 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg), - OFF_3 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg), - OFF_3 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg), - OFF_3 + CFL_LINE_3, pred_buf); + 
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); } - } while ((pred_buf += CFL_BUF_LINE * 4) < end); + } while ((dst += CFL_BUF_LINE * 4) < dst_end); } // Declare wrappers for VSX sizes diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c index d77739d85..5952441d1 100644 --- a/third_party/aom/av1/common/pred_common.c +++ b/third_party/aom/av1/common/pred_common.c @@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx_offset = (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; - MV_REFERENCE_FRAME ref_frame = - (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1]; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; // Note: // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h index 6a835c467..6dba2322d 100644 --- a/third_party/aom/av1/common/pred_common.h +++ b/third_party/aom/av1/common/pred_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_PRED_COMMON_H_ -#define AV1_COMMON_PRED_COMMON_H_ +#ifndef AOM_AV1_COMMON_PRED_COMMON_H_ +#define AOM_AV1_COMMON_PRED_COMMON_H_ #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" @@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { } // extern "C" #endif -#endif // AV1_COMMON_PRED_COMMON_H_ +#endif // AOM_AV1_COMMON_PRED_COMMON_H_ diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h index ca199e94c..d1f52a660 100644 --- a/third_party/aom/av1/common/quant_common.h +++ b/third_party/aom/av1/common/quant_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_QUANT_COMMON_H_ -#define AV1_COMMON_QUANT_COMMON_H_ +#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ +#define AOM_AV1_COMMON_QUANT_COMMON_H_ #include "aom/aom_codec.h" #include "av1/common/seg_common.h" @@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, } // extern "C" #endif -#endif // AV1_COMMON_QUANT_COMMON_H_ +#endif // AOM_AV1_COMMON_QUANT_COMMON_H_ diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c index b9f0b57f3..3203efce4 100644 --- a/third_party/aom/av1/common/reconinter.c +++ b/third_party/aom/av1/common/reconinter.c @@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, if (build_for_obmc) return 0; - if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) { + if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) { if (final_warp_params != NULL) - memcpy(final_warp_params, &mbmi->wm_params[0], - sizeof(*final_warp_params)); + memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params)); return 1; } else if (warp_types->global_warp_allowed && !gm_params->invalid) { if (final_warp_params != NULL) @@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, av1_allow_warp(mi, 
warp_types, &xd->global_motion[mi->ref_frame[ref]], build_for_obmc, subpel_params->xs, subpel_params->ys, &final_warp_params)); + const int is_intrabc = mi->use_intrabc; + assert(IMPLIES(is_intrabc, !do_warp)); + if (do_warp && xd->cur_frame_force_integer_mv == 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const pre_buf = &pd->pre[ref]; @@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, pd->subsampling_x, pd->subsampling_y, conv_params); } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, - w, h, conv_params, interp_filters, xd->bd); + w, h, conv_params, interp_filters, is_intrabc, + xd->bd); } else { inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, - conv_params, interp_filters); + conv_params, interp_filters, is_intrabc); } } @@ -574,37 +577,6 @@ static void build_masked_compound_no_round( h, subw, subh, conv_params); } -static void build_masked_compound( - uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. 
- const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; - const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, block_size_wide[sb_type], w, h, subw, subh); -} - -static void build_masked_compound_highbd( - uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, - const uint8_t *src1_8, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w, int bd) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. - const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; - const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - // const uint8_t *mask = - // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); - aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, block_size_wide[sb_type], w, h, - subw, subh, bd); -} - void av1_make_masked_inter_predictor( const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, const struct scale_factors *sf, int w, @@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor( mi->sb_type, h, w, conv_params, xd); } -// TODO(sarahparker) av1_highbd_build_inter_predictor and -// av1_build_inter_predictor should be combined with -// av1_make_inter_predictor -void av1_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, - InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous) { - const int is_q4 = precision == MV_PRECISION_Q4; - const MV mv_q4 = { 
is_q4 ? src_mv->row : src_mv->row * 2, - is_q4 ? src_mv->col : src_mv->col * 2 }; - MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); - mv.col += SCALE_EXTRA_OFF; - mv.row += SCALE_EXTRA_OFF; - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - mv.col & SCALE_SUBPEL_MASK, - mv.row & SCALE_SUBPEL_MASK }; - ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); - - src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + - (mv.col >> SCALE_SUBPEL_BITS); - - av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, - w, h, &conv_params, interp_filters, warp_types, - p_col, p_row, plane, ref, xd->mi[0], 0, xd, - can_use_previous); -} - -void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *src_mv, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous) { - const int is_q4 = precision == MV_PRECISION_Q4; - const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, - is_q4 ? 
src_mv->col : src_mv->col * 2 }; - MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); - mv.col += SCALE_EXTRA_OFF; - mv.row += SCALE_EXTRA_OFF; - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - mv.col & SCALE_SUBPEL_MASK, - mv.row & SCALE_SUBPEL_MASK }; - src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + - (mv.col >> SCALE_SUBPEL_BITS); - - av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, - w, h, conv_params, interp_filters, warp_types, p_col, - p_row, plane, ref, xd->mi[0], 0, xd, - can_use_previous); -} - void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int order_idx, int *fwd_offset, int *bck_offset, int *use_jnt_comp_avg, int is_compound) { @@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; } -static INLINE void calc_subpel_params( - MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, - int plane, const int pre_x, const int pre_y, int x, int y, - struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params, - int bw, int bh) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int is_scaled = av1_is_scaled(sf); - if (is_scaled) { - int ssx = pd->subsampling_x; - int ssy = pd->subsampling_y; - int orig_pos_y = (pre_y + y) << SUBPEL_BITS; - orig_pos_y += mv.row * (1 << (1 - ssy)); - int orig_pos_x = (pre_x + x) << SUBPEL_BITS; - orig_pos_x += mv.col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = 
clamp(pos_x, left, right); - - *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; - subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; - subpel_params->xs = sf->x_step_q4; - subpel_params->ys = sf->y_step_q4; - } else { - const MV mv_q4 = clamp_mv_to_umv_border_sb( - xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); - subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; - subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; - subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + - (x + (mv_q4.col >> SUBPEL_BITS)); - } -} - -static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int plane, const MB_MODE_INFO *mi, - int build_for_obmc, int bw, int bh, - int mi_x, int mi_y) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - int is_compound = has_second_ref(mi); - int ref; - const int is_intrabc = is_intrabc_block(mi); - assert(IMPLIES(is_intrabc, !is_compound)); - int is_global[2] = { 0, 0 }; - for (ref = 0; ref < 1 + is_compound; ++ref) { - const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; - is_global[ref] = is_global_mv_block(mi, wm->wmtype); - } - - const BLOCK_SIZE bsize = mi->sb_type; - const int ss_x = pd->subsampling_x; - const int ss_y = pd->subsampling_y; - int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || - (block_size_high[bsize] < 8 && ss_y); - - if (is_intrabc) sub8x8_inter = 0; - - // For sub8x8 chroma blocks, we may be covering more than one luma block's - // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for - // the top-left corner of the prediction source - the correct top-left corner - // is at (pre_x, pre_y). - const int row_start = - (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? 
-1 : 0; - const int col_start = - (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; - const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; - - sub8x8_inter = sub8x8_inter && !build_for_obmc; - if (sub8x8_inter) { - for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { - for (int col = col_start; col <= 0; ++col) { - const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; - if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; - if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; - } - } - } - - if (sub8x8_inter) { - // block size - const int b4_w = block_size_wide[bsize] >> ss_x; - const int b4_h = block_size_high[bsize] >> ss_y; - const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); - const int b8_w = block_size_wide[plane_bsize] >> ss_x; - const int b8_h = block_size_high[plane_bsize] >> ss_y; - assert(!is_compound); - - const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; - - int row = row_start; - for (int y = 0; y < b8_h; y += b4_h) { - int col = col_start; - for (int x = 0; x < b8_w; x += b4_w) { - MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; - is_compound = has_second_ref(this_mbmi); - DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]); - int tmp_dst_stride = 8; - assert(bw < 8 || bh < 8); - ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd); - conv_params.use_jnt_comp_avg = 0; - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; - - ref = 0; - const RefBuffer *ref_buf = - &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; - - pd->pre[ref].buf0 = - (plane == 1) ? 
ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; - pd->pre[ref].buf = - pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, - ref_buf->buf->uv_stride, - &ref_buf->sf); - pd->pre[ref].width = ref_buf->buf->uv_crop_width; - pd->pre[ref].height = ref_buf->buf->uv_crop_height; - pd->pre[ref].stride = ref_buf->buf->uv_stride; - - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &ref_buf->sf; - struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; - - const MV mv = this_mbmi->mv[ref].as_mv; - - uint8_t *pre; - SubpelParams subpel_params; - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[ref]; - warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; - - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, - &subpel_params, bw, bh); - - conv_params.ref = ref; - conv_params.do_average = ref; - if (is_masked_compound_type(mi->interinter_comp.type)) { - // masked compound type has its own average mechanism - conv_params.do_average = 0; - } - - av1_make_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, - b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, - (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, - plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); - - ++col; - } - ++row; - } - - for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; - return; - } - - { - DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); - ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd); - av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, - &conv_params.bck_offset, - &conv_params.use_jnt_comp_avg, is_compound); - - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf; - for (ref = 0; ref < 1 + is_compound; ++ref) { - const struct scale_factors *const sf = - is_intrabc ? 
&cm->sf_identity : &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; - const MV mv = mi->mv[ref].as_mv; - - uint8_t *pre; - SubpelParams subpel_params; - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre, - &subpel_params, bw, bh); - - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[ref]; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - conv_params.ref = ref; - - if (ref && is_masked_compound_type(mi->interinter_comp.type)) { - // masked compound type has its own average mechanism - conv_params.do_average = 0; - av1_make_masked_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, - bh, &conv_params, mi->interp_filters, plane, &warp_types, - mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd, - cm->allow_warped_motion); - } else { - conv_params.do_average = ref; - av1_make_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, - bh, &conv_params, mi->interp_filters, &warp_types, - mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref, - mi, build_for_obmc, xd, cm->allow_warped_motion); - } - } - } -} - -static void build_inter_predictors_for_planes(const AV1_COMMON *cm, - MACROBLOCKD *xd, BLOCK_SIZE bsize, - int mi_row, int mi_col, - int plane_from, int plane_to) { - int plane; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - for (plane = plane_from; plane <= plane_to; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = pd->width; - const int bh = pd->height; - - if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) - continue; - - build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); - } -} - -void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - 
build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0); - - if (is_interintra_pred(xd->mi[0])) { - BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, - { xd->plane[0].dst.stride, 0, 0 } }; - if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf, - xd->plane[0].dst.stride, ctx, 0, bsize); - } -} - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1, - MAX_MB_PLANE - 1); - - if (is_interintra_pred(xd->mi[0])) { - BUFFER_SET default_ctx = { - { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, - { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride } - }; - if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sbuv( - cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, - xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize); - } -} - -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - const int num_planes = av1_num_planes(cm); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - if (num_planes > 1) - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); -} - void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const int plane_start, const int plane_end) { @@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred( xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); xd->mb_to_right_edge = ctxt->mb_to_far_edge + - (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; -} - -static INLINE void build_prediction_by_above_pred( - MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, - MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { - struct build_prediction_ctxt *ctxt = (struct 
build_prediction_ctxt *)fun_ctxt; - const int above_mi_col = ctxt->mi_col + rel_mi_col; - int mi_x, mi_y; - MB_MODE_INFO backup_mbmi = *above_mbmi; - - av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, - above_mbmi, ctxt, num_planes); - mi_x = above_mi_col << MI_SIZE_LOG2; - mi_y = ctxt->mi_row << MI_SIZE_LOG2; - - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - - for (int j = 0; j < num_planes; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; - int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, - block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); - - if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; - build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y); - } - *above_mbmi = backup_mbmi; -} - -void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]) { - if (!xd->up_available) return; - - // Adjust mb_to_bottom_edge to have the correct value for the OBMC - // prediction block. This is half the height of the original block, - // except for 128-wide blocks, where we only use a height of 32. 
- int this_height = xd->n8_h * MI_SIZE; - int pred_height = AOMMIN(this_height / 2, 32); - xd->mb_to_bottom_edge += (this_height - pred_height) * 8; - - struct build_prediction_ctxt ctxt = { cm, mi_row, - mi_col, tmp_buf, - tmp_width, tmp_height, - tmp_stride, xd->mb_to_right_edge }; - BLOCK_SIZE bsize = xd->mi[0]->sb_type; - foreach_overlappable_nb_above(cm, xd, mi_col, - max_neighbor_obmc[mi_size_wide_log2[bsize]], - build_prediction_by_above_pred, &ctxt); - - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ctxt.mb_to_far_edge; - xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; + (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; } void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, @@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + - (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; -} - -static INLINE void build_prediction_by_left_pred( - MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, - MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { - struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; - const int left_mi_row = ctxt->mi_row + rel_mi_row; - int mi_x, mi_y; - MB_MODE_INFO backup_mbmi = *left_mbmi; - - av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height, - left_mbmi, ctxt, num_planes); - mi_x = ctxt->mi_col << MI_SIZE_LOG2; - mi_y = left_mi_row << MI_SIZE_LOG2; - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - - for (int j = 0; j < num_planes; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, - block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); - int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; - - if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; - 
build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y); - } - *left_mbmi = backup_mbmi; -} - -void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]) { - if (!xd->left_available) return; - - // Adjust mb_to_right_edge to have the correct value for the OBMC - // prediction block. This is half the width of the original block, - // except for 128-wide blocks, where we only use a width of 32. - int this_width = xd->n8_w * MI_SIZE; - int pred_width = AOMMIN(this_width / 2, 32); - xd->mb_to_right_edge += (this_width - pred_width) * 8; - - struct build_prediction_ctxt ctxt = { cm, mi_row, - mi_col, tmp_buf, - tmp_width, tmp_height, - tmp_stride, xd->mb_to_bottom_edge }; - BLOCK_SIZE bsize = xd->mi[0]->sb_type; - foreach_overlappable_nb_left(cm, xd, mi_row, - max_neighbor_obmc[mi_size_high_log2[bsize]], - build_prediction_by_left_pred, &ctxt); - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_right_edge -= (this_width - pred_width) * 8; - xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; -} - -void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { - const int num_planes = av1_num_planes(cm); - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; - int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height2[MAX_MB_PLANE] = { 
MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); - } else { - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; - dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; - dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; - } - av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, - dst_width1, dst_height1, dst_stride1); - av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, - dst_width2, dst_height2, dst_stride2); - av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), - mi_row, mi_col, 0, num_planes); - av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, - dst_buf2, dst_stride2); + (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; } /* clang-format off */ @@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); } - -void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, uint8_t *upred, - uint8_t *vpred, int ystride, int ustride, - int vstride, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize); - av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride, - ctx, bsize); -} - -// Builds the inter-predictor for the single ref case -// for use in the encoder 
to search the wedges efficiently. -static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, - int bw, int bh, int x, int y, - int w, int h, int mi_x, int mi_y, - int ref, uint8_t *const ext_dst, - int ext_dst_stride, - int can_use_previous) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO *mi = xd->mi[0]; - - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; - uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; - const MV mv = mi->mv[ref].as_mv; - - ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); - WarpTypesAllowed warp_types; - const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; - warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - const int pre_x = (mi_x) >> pd->subsampling_x; - const int pre_y = (mi_y) >> pd->subsampling_y; - uint8_t *pre; - SubpelParams subpel_params; - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, - &subpel_params, bw, bh); - - av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, - &subpel_params, sf, w, h, &conv_params, - mi->interp_filters, &warp_types, pre_x + x, - pre_y + y, plane, ref, mi, 0, xd, can_use_previous); -} - -void av1_build_inter_predictors_for_planes_single_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], - int can_use_previous) { - int plane; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = get_plane_block_size( - bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - 
build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x, - mi_y, ref, ext_dst[plane], - ext_dst_stride[plane], can_use_previous); - } -} - -static void build_wedge_inter_predictor_from_buf( - MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, - int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int is_compound = has_second_ref(mbmi); - MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - mbmi->interinter_comp.seg_mask = xd->seg_mask; - const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; - - if (is_compound && is_masked_compound_type(comp_data->type)) { - if (!plane && comp_data->type == COMPOUND_DIFFWTD) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - av1_build_compound_diffwtd_mask_highbd( - comp_data->seg_mask, comp_data->mask_type, - CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); - else - av1_build_compound_diffwtd_mask( - comp_data->seg_mask, comp_data->mask_type, ext_dst0, - ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_highbd( - dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, - mbmi->sb_type, h, w, xd->bd); - else - build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, - ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, - h, w); - } else { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, - xd->bd); - else - aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, - 0, NULL, 0, w, h); - } -} - -void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD 
*xd, BLOCK_SIZE bsize, - int plane_from, int plane_to, - uint8_t *ext_dst0[3], - int ext_dst_stride0[3], - uint8_t *ext_dst1[3], - int ext_dst_stride1[3]) { - int plane; - for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = get_plane_block_size( - bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - build_wedge_inter_predictor_from_buf( - xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], - ext_dst1[plane], ext_dst_stride1[plane]); - } -} diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h index 6a3def270..db86c777e 100644 --- a/third_party/aom/av1/common/reconinter.h +++ b/third_party/aom/av1/common/reconinter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RECONINTER_H_ -#define AV1_COMMON_RECONINTER_H_ +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ #include "av1/common/filter.h" #include "av1/common/onyxc_int.h" @@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const SubpelParams *subpel_params, const struct scale_factors *sf, int w, int h, ConvolveParams *conv_params, - InterpFilters interp_filters) { + InterpFilters interp_filters, + int is_intrabc) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); - if (has_scale(subpel_params->xs, subpel_params->ys)) { + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + assert(IMPLIES(is_intrabc, !is_scaled)); + if (is_scaled) { av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf); + subpel_params->ys, 1, conv_params, sf, is_intrabc); } else { SubpelParams sp = *subpel_params; 
revert_scale_extra_bits(&sp); av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, - sp.ys, 0, conv_params, sf); + sp.ys, 0, conv_params, sf, is_intrabc); } } -static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) { +static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, + const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + InterpFilters interp_filters, + int is_intrabc, int bd) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); - if (has_scale(subpel_params->xs, subpel_params->ys)) { - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_params->subpel_x, - subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, bd); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + assert(IMPLIES(is_intrabc, !is_scaled)); + if (is_scaled) { + av1_highbd_convolve_2d_facade( + src, src_stride, dst, dst_stride, w, h, interp_filters, + subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, is_intrabc, bd); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, sp.subpel_x, sp.xs, - sp.subpel_y, sp.ys, 0, conv_params, sf, bd); + av1_highbd_convolve_2d_facade( + src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, + sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd); } } @@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, return clamped_mv; } -void 
av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *src_mv, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous); - -void av1_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, - InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous); - static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { const int x = @@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *sf, const int num_planes); -// Detect if the block have sub-pixel level motion vectors -// per component. 
-#define CHECK_SUBPEL 0 -static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi, - const MACROBLOCKD *const xd, - int dir) { -#if CHECK_SUBPEL - const BLOCK_SIZE bsize = mbmi->sb_type; - int plane; - int ref = (dir >> 1); - - if (dir & 0x01) { - if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; - } else { - if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; - } - - return 0; -#else - (void)mbmi; - (void)xd; - (void)dir; - return 1; -#endif -} - static INLINE void set_default_interp_filters( MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { mbmi->interp_filters = @@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { return 1; } -static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { - MB_MODE_INFO *const mi = xd->mi[0]; - const int is_compound = has_second_ref(mi); - int ref; - for (ref = 0; ref < 1 + is_compound; ++ref) { - int row_col; - for (row_col = 0; row_col < 2; ++row_col) { - const int dir = (ref << 1) + row_col; - if (has_subpel_mv_component(mi, xd, dir)) { - return 1; - } - } - } - return 0; -} void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, @@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, MB_MODE_INFO *left_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes); -void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]); -void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]); void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, 
MACROBLOCKD *xd, int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], @@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, const uint8_t *av1_get_obmc_mask(int length); void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col); -void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) @@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); -void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, uint8_t *upred, - uint8_t *vpred, int ystride, int ustride, - int vstride, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - // build interintra_predictors for one plane void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, @@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -// Encoder only -void av1_build_inter_predictors_for_planes_single_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], - int can_use_previous); -void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane_from, int plane_to, - uint8_t *ext_dst0[3], - int ext_dst_stride0[3], - uint8_t *ext_dst1[3], - int ext_dst_stride1[3]); - void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int order_idx, int *fwd_offset, int *bck_offset, int *use_jnt_comp_avg, int is_compound); @@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, } // extern "C" #endif 
-#endif // AV1_COMMON_RECONINTER_H_ +#endif // AOM_AV1_COMMON_RECONINTER_H_ diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h index 57638f24e..07853aba0 100644 --- a/third_party/aom/av1/common/reconintra.h +++ b/third_party/aom/av1/common/reconintra.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RECONINTRA_H_ -#define AV1_COMMON_RECONINTRA_H_ +#ifndef AOM_AV1_COMMON_RECONINTRA_H_ +#define AOM_AV1_COMMON_RECONINTRA_H_ #include @@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_RECONINTRA_H_ +#endif // AOM_AV1_COMMON_RECONINTRA_H_ diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 93d62292a..d61a20aa2 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, }; -// Filters for interpolation (full-band) - no filtering for integer pixels -static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, - { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, - { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, - { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, - { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, - { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, - { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, - { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, - { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, - { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, 
-1 }, - { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, - { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, - { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, - { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, - { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, - { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, - { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, - { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, - { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, - { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, - { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, - { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, - { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, - { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, - { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, - { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, - { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, - { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, - { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, - { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, - { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, - { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, -}; - const int16_t av1_resize_filter_normative[( 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { #if UPSCALE_NORMATIVE_TAPS == 8 @@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[( #endif // UPSCALE_NORMATIVE_TAPS == 8 }; +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 
av1_resize_filter_normative + // Filters for factor of 2 downsampling. static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index feec3a90e..9a59a8d63 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RESIZE_H_ -#define AV1_ENCODER_RESIZE_H_ +#ifndef AOM_AV1_COMMON_RESIZE_H_ +#define AOM_AV1_COMMON_RESIZE_H_ #include #include "aom/aom_integer.h" @@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length); } // extern "C" #endif -#endif // AV1_ENCODER_RESIZE_H_ +#endif // AOM_AV1_COMMON_RESIZE_H_ diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c index 632967957..d276a915b 100644 --- a/third_party/aom/av1/common/restoration.c +++ b/third_party/aom/av1/common/restoration.c @@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = { 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; -static void selfguided_restoration_fast_internal( - int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, - int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { const sgr_params_type *const params = &sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; @@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal( // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. 
int buf_stride = ((width_ext + 3) & ~3) + 16; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *A = A_; - int32_t *B = B_; + const int step = pass == 0 ? 1 : 2; int i, j; assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); @@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal( B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. - for (i = -1; i < height + 1; i += 2) { + for (i = -1; i < height + 1; i += step) { for (j = -1; j < width + 1; ++j) { const int k = i * buf_stride + j; const int n = (2 * r + 1) * (2 * r + 1); @@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal( SGRPROJ_RECIP_BITS); } } +} + +static void selfguided_restoration_fast_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. 
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; assert(r == 2); for (i = 0; i < height; ++i) { if (!(i & 1)) { // even row @@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; - const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. 
// We also align the stride to a multiple of 16 bytes, for consistency @@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, int32_t *A = A_; int32_t *B = B_; int i, j; - - assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); - assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && - "Need SGRPROJ_BORDER_* >= r+1"); - - boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, - width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); - boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, - width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, - // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. - for (i = -1; i < height + 1; ++i) { - for (j = -1; j < width + 1; ++j) { - const int k = i * buf_stride + j; - const int n = (2 * r + 1) * (2 * r + 1); - - // a < 2^16 * n < 2^22 regardless of bit depth - uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); - // b < 2^8 * n < 2^14 regardless of bit depth - uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); - - // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, - // and p itself satisfies p < 2^14 * n^2 < 2^26. - // This bound on p is due to: - // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances - // - // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. - // This is an artefact of rounding, and can only happen if all pixels - // are (almost) identical, so in this case we saturate to p=0. - uint32_t p = (a * n < b * b) ? 
0 : a * n - b * b; - - const uint32_t s = params->s[radius_idx]; - - // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 - // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 - // (this holds even after accounting for the rounding in s) - const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); - - // Note: We have to be quite careful about the value of A[k]. - // This is used as a blend factor between individual pixel values and the - // local mean. So it logically has a range of [0, 256], including both - // endpoints. - // - // This is a pain for hardware, as we'd like something which can be stored - // in exactly 8 bits. - // Further, in the calculation of B[k] below, if z == 0 and r == 2, - // then A[k] "should be" 0. But then we can end up setting B[k] to a value - // slightly above 2^(8 + bit depth), due to rounding in the value of - // one_by_x[25-1]. - // - // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. - // This fixes the above issues (256 - A[k] fits in a uint8, and we can't - // overflow), without significantly affecting the final result: z == 0 - // implies that the image is essentially "flat", so the local mean and - // individual pixel values are very similar. - // - // Note that saturating on the other side, ie. requring A[k] <= 255, - // would be a bad idea, as that corresponds to the case where the image - // is very variable, when we want to preserve the local pixel value as - // much as possible. - A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] - // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, - // one_by_x[n - 1] = round(2^12 / n) - // => the product here is < 2^(20 + bit_depth) <= 2^32, - // and B[k] is set to a value < 2^(8 + bit depth) - // This holds even with the rounding in one_by_x and in the overall - // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. 
- B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * - (uint32_t)B[k] * - (uint32_t)one_by_x[n - 1], - SGRPROJ_RECIP_BITS); - } - } // Use the A[] and B[] arrays to calculate the filtered image for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, } } -void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = @@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, if (params->r[1] > 0) selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); + return 0; } void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, @@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width, - eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h index aec37d834..d834f9270 100644 --- a/third_party/aom/av1/common/restoration.h +++ b/third_party/aom/av1/common/restoration.h 
@@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RESTORATION_H_ -#define AV1_COMMON_RESTORATION_H_ +#ifndef AOM_AV1_COMMON_RESTORATION_H_ +#define AOM_AV1_COMMON_RESTORATION_H_ #include "aom_ports/mem.h" #include "config/aom_config.h" @@ -120,6 +120,7 @@ extern "C" { // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. #define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) #define WIENER_FILT_PREC_BITS 7 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) @@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, } // extern "C" #endif -#endif // AV1_COMMON_RESTORATION_H_ +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h index 5f02fdb81..748e958c3 100644 --- a/third_party/aom/av1/common/scale.h +++ b/third_party/aom/av1/common/scale.h @@ -9,12 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SCALE_H_ -#define AV1_COMMON_SCALE_H_ +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ #include "av1/common/convolve.h" #include "av1/common/mv.h" -#include "aom_dsp/aom_convolve.h" #ifdef __cplusplus extern "C" { @@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height, } // extern "C" #endif -#endif // AV1_COMMON_SCALE_H_ +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h index d206586b5..233dc0efa 100644 --- a/third_party/aom/av1/common/scan.h +++ b/third_party/aom/av1/common/scan.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_SCAN_H_ -#define AV1_COMMON_SCAN_H_ +#ifndef AOM_AV1_COMMON_SCAN_H_ +#define AOM_AV1_COMMON_SCAN_H_ #include "aom/aom_integer.h" #include "aom_ports/mem.h" @@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { } // extern "C" #endif -#endif // AV1_COMMON_SCAN_H_ +#endif // AOM_AV1_COMMON_SCAN_H_ diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h index c851d65fd..8c35bba86 100644 --- a/third_party/aom/av1/common/seg_common.h +++ b/third_party/aom/av1/common/seg_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SEG_COMMON_H_ -#define AV1_COMMON_SEG_COMMON_H_ +#ifndef AOM_AV1_COMMON_SEG_COMMON_H_ +#define AOM_AV1_COMMON_SEG_COMMON_H_ #include "aom_dsp/prob.h" @@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // AV1_COMMON_SEG_COMMON_H_ +#endif // AOM_AV1_COMMON_SEG_COMMON_H_ diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c index f9b734b8c..8df4c9a09 100644 --- a/third_party/aom/av1/common/thread_common.c +++ b/third_party/aom/av1/common/thread_common.c @@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, lf_sync); return 1; @@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (AVxWorkerHook)loop_filter_row_worker; + 
worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; @@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { } // Implement row loop restoration for each thread. -static int loop_restoration_row_worker(AV1LrSync *const lr_sync, - LRWorkerData *lrworkerdata) { +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; FilterFrameCtxt *ctxt = lr_ctxt->ctxt; int lr_unit_row; @@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, int num_rows_lr = 0; for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const AV1PixelRect tile_rect = ctxt[plane].tile_rect; const int max_tile_h = tile_rect.bottom - tile_rect.top; - const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + const int unit_size = cm->rst_info[plane].restoration_unit_size; num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h)); @@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; - worker->hook = (AVxWorkerHook)loop_restoration_row_worker; + worker->hook = loop_restoration_row_worker; worker->data1 = lr_sync; worker->data2 = &lr_sync->lrworkerdata[i]; diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h index 4b0d5d2b8..23d61d72a 100644 --- a/third_party/aom/av1/common/thread_common.h +++ b/third_party/aom/av1/common/thread_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_ -#define AV1_COMMON_LOOPFILTER_THREAD_H_ +#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ +#define AOM_AV1_COMMON_THREAD_COMMON_H_ #include "config/aom_config.h" @@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers); } // extern "C" #endif -#endif // AV1_COMMON_LOOPFILTER_THREAD_H_ +#endif // AOM_AV1_COMMON_THREAD_COMMON_H_ diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c index 026c904b6..1b413487f 100644 --- a/third_party/aom/av1/common/tile_common.c +++ b/third_party/aom/av1/common/tile_common.c @@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(tile->mi_col_end > tile->mi_col_start); } +int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_rows; +} + +int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_cols; +} + int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { // Round the frame up to a whole number of max superblocks mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2); diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h index be037fb17..c03553dc6 100644 --- a/third_party/aom/av1/common/tile_common.h +++ b/third_party/aom/av1/common/tile_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_TILE_COMMON_H_ -#define AV1_COMMON_TILE_COMMON_H_ +#ifndef AOM_AV1_COMMON_TILE_COMMON_H_ +#define AOM_AV1_COMMON_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, // tiles horizontally or vertically in the frame. int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); +int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile); +int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile); + typedef struct { int left, top, right, bottom; } AV1PixelRect; @@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm); } // extern "C" #endif -#endif // AV1_COMMON_TILE_COMMON_H_ +#endif // AOM_AV1_COMMON_TILE_COMMON_H_ diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h index 1749baa57..06939ae43 100644 --- a/third_party/aom/av1/common/timing.h +++ b/third_party/aom/av1/common/timing.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_TIMING_H_ -#define AOM_TIMING_H_ +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ #include "aom/aom_integer.h" #include "av1/common/enums.h" @@ -56,4 +56,4 @@ void set_resource_availability_parameters( int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, int seq_tier); -#endif // AOM_TIMING_H_ +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h index 9a6b454ac..53e956450 100644 --- a/third_party/aom/av1/common/token_cdfs.h +++ b/third_party/aom/av1/common/token_cdfs.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + #include "config/aom_config.h" #include "av1/common/entropy.h" @@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h index f0ab79d0f..1dda51f8b 100644 --- a/third_party/aom/av1/common/txb_common.h +++ b/third_party/aom/av1/common/txb_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_TXB_COMMON_H_ -#define AV1_COMMON_TXB_COMMON_H_ +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ extern const int16_t k_eob_group_start[12]; extern const int16_t k_eob_offset_bits[12]; @@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = { 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, }; -// Note: TX_PAD_2D is dependent to this offset table. 
-static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = { - /* clang-format off*/ - { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 }, - { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 } - /* clang-format on*/ -}; - -#define CONTEXT_MAG_POSITION_NUM 3 -static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = { - { { 0, 1 }, { 1, 0 }, { 1, 1 } }, - { { 0, 1 }, { 1, 0 }, { 0, 2 } }, - { { 0, 1 }, { 1, 0 }, { 2, 0 } } -}; -static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = { - { 0, 1 }, { 1, 0 }, { 1, 1 } -}; - static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_2D, // DCT_DCT TX_CLASS_2D, // ADST_DCT @@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_HORIZ, // H_FLIPADST }; -static const int8_t eob_to_pos_small[33] = { - 0, 1, 2, // 0-2 - 3, 3, // 3-4 - 4, 4, 4, 4, // 5-8 - 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 -}; - -static const int8_t eob_to_pos_large[17] = { - 6, // place holder - 7, // 33-64 - 8, 8, // 65-128 - 9, 9, 9, 9, // 129-256 - 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 - 11 // 513- -}; - -static INLINE int get_eob_pos_token(const int eob, int *const extra) { - int t; - - if (eob < 33) { - t = eob_to_pos_small[eob]; - } else { - const int e = AOMMIN((eob - 1) >> 5, 16); - t = eob_to_pos_large[e]; - } - - *extra = eob - k_eob_group_start[t]; - - return t; -} - -static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type, - const int eob_token) { - static const int8_t tx_type_to_offset[TX_TYPES] = { - -1, // DCT_DCT - -1, // ADST_DCT - -1, // DCT_ADST - -1, // ADST_ADST - -1, // FLIPADST_DCT - -1, // DCT_FLIPADST - -1, // FLIPADST_FLIPADST - -1, // ADST_FLIPADST - -1, // FLIPADST_ADST - -1, // IDTX - 10, // V_DCT - 10, // H_DCT - 10, // V_ADST - 10, // H_ADST - 10, // V_FLIPADST - 10, // H_FLIPADST - }; - return eob_token + tx_type_to_offset[tx_type]; -} - static INLINE int get_txb_bwl(TX_SIZE 
tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_wide_log2[tx_size]; @@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) { return tx_size_high[tx_size]; } -static INLINE void get_base_count_mag(int *mag, int *count, - const tran_low_t *tcoeffs, int bwl, - int height, int row, int col) { - mag[0] = 0; - mag[1] = 0; - for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0; - for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { - const int ref_row = row + base_ref_offset[idx][0]; - const int ref_col = col + base_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - // count - for (int i = 0; i < NUM_BASE_LEVELS; ++i) { - count[i] += abs_coeff > i; - } - // mag - if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) { - if (abs_coeff > mag[0]) { - mag[0] = abs_coeff; - mag[1] = 1; - } else if (abs_coeff == mag[0]) { - ++mag[1]; - } - } - } -} - static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) { return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); } @@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) { return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); } -static INLINE int get_level_count(const uint8_t *const levels, const int stride, - const int row, const int col, const int level, - const int (*nb_offset)[2], const int nb_num) { - int count = 0; - - for (int idx = 0; idx < nb_num; ++idx) { - const int ref_row = row + nb_offset[idx][0]; - const int ref_col = col + nb_offset[idx][1]; - const int pos = ref_row * stride + ref_col; - count += levels[pos] > level; - } - return count; -} - -static INLINE void get_level_mag(const uint8_t *const levels, const int stride, - const int row, const int col, int *const mag) { - for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) { - const int ref_row = row + 
mag_ref_offset[idx][0]; - const int ref_col = col + mag_ref_offset[idx][1]; - const int pos = ref_row * stride + ref_col; - mag[idx] = levels[pos]; - } -} - static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, int sig_mag) { const int ctx = base_level_count_to_index[count]; @@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, return ctx_idx; } -static INLINE int get_base_ctx(const uint8_t *const levels, - const int c, // raster order - const int bwl, const int level_minus_1, - const int count) { - const int row = c >> bwl; - const int col = c - (row << bwl); - const int stride = (1 << bwl) + TX_PAD_HOR; - int mag_count = 0; - int nb_mag[3] = { 0 }; - - get_level_mag(levels, stride, row, col, nb_mag); - - for (int idx = 0; idx < 3; ++idx) - mag_count += nb_mag[idx] > (level_minus_1 + 1); - const int ctx_idx = - get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count)); - return ctx_idx; -} - -#define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context -// Note: TX_PAD_2D is dependent to this offset table. -static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = { - /* clang-format off*/ - { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 }, - { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, - /* clang-format on*/ -}; - -static const int br_level_map[9] = { - 0, 0, 1, 1, 2, 2, 3, 3, 3, -}; - -// Note: If BR_MAG_OFFSET changes, the calculation of offset in -// get_br_ctx_from_count_mag() must be updated. 
-#define BR_MAG_OFFSET 1 -// TODO(angiebird): optimize this function by using a table to map from -// count/mag to ctx - -static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl, - int height, int row, int col, int level) { - mag[0] = 0; - mag[1] = 0; - int count = 0; - for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) { - const int ref_row = row + br_ref_offset[idx][0]; - const int ref_col = col + br_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - count += abs_coeff > level; - if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) { - if (abs_coeff > mag[0]) { - mag[0] = abs_coeff; - mag[1] = 1; - } else if (abs_coeff == mag[0]) { - ++mag[1]; - } - } - } - return count; -} - -static INLINE int get_br_ctx_from_count_mag(const int row, const int col, - const int count, const int mag) { - // DC: 0 - 1 - // Top row: 2 - 4 - // Left column: 5 - 7 - // others: 8 - 11 - static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } }; - const int mag_clamp = AOMMIN(mag, 6); - const int offset = mag_clamp >> 1; - const int ctx = - br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col]; - return ctx; -} - static INLINE int get_br_ctx_2d(const uint8_t *const levels, const int c, // raster order const int bwl) { @@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, return mag + 14; } -#define SIG_REF_OFFSET_NUM 5 - -// Note: TX_PAD_2D is dependent to these offset tables. 
-static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = { - { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 } - // , { 1, 2 }, { 2, 1 }, -}; - -static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = { - { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 } - // , { 1, 1 }, { 2, 1 }, -}; - -static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = { - { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 } - // , { 1, 1 }, { 1, 2 }, -}; - -#define SIG_REF_DIFF_OFFSET_NUM 3 - -static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 1, 1 }, { 0, 2 }, { 2, 0 } -}; - -static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 2, 0 }, { 3, 0 }, { 4, 0 } -}; - -static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 0, 2 }, { 0, 3 }, { 0, 4 } -}; - static const uint8_t clip_max3[256] = { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, void av1_init_lv_map(AV1_COMMON *cm); -#endif // AV1_COMMON_TXB_COMMON_H_ +#endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index 412d83ed8..4144c4389 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -562,7 +562,7 @@ static int64_t highbd_warp_error( const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0, bd); + ConvolveParams conv_params = get_conv_params(0, 0, bd); conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { @@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, int 
error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0, 8); + ConvolveParams conv_params = get_conv_params(0, 0, 8); conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index ce4032ee5..a1a4f067d 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_WARPED_MOTION_H_ -#define AV1_COMMON_WARPED_MOTION_H_ +#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_ +#define AOM_AV1_COMMON_WARPED_MOTION_H_ #include #include @@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mi_col); int get_shear_params(WarpedMotionParams *wm); -#endif // AV1_COMMON_WARPED_MOTION_H_ +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c index 0c5286f9d..d9fb53785 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c index ae331b40d..5db2ccf6c 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -18,6 +18,12 @@ #include "av1/common/x86/av1_inv_txfm_avx2.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" +// TODO(venkatsanampudi@ittiam.com): 
move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h index 7b5b29cf8..f74cbaeaa 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ -#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ #include @@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, } #endif -#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c index dd7cee24c..995bc3da4 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -16,6 +16,12 @@ #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/av1_txfm_sse2.h" +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + // TODO(binpengsmail@gmail.com): replace some for loop with do {} while static void idct4_new_sse2(const __m128i *input, __m128i *output, diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h 
b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h index dc9be25d2..66bd339d1 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ -#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ #include // SSE2 #include // SSSE3 @@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, }; -// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 -static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, - 4 * 5793 }; - DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, }; @@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, } // extern "C" #endif -#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h index 721cfe059..77aeb6eb1 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h +++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_ -#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ #include // SSE2 @@ -314,4 +314,4 @@ typedef struct { #ifdef __cplusplus } #endif // __cplusplus -#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h index 367e02096..6cad821b1 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_TXFM_SSE4_H_ -#define AV1_TXFM_SSE4_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ #include @@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input, static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, __m128i *output, const int size, - const int bit) { - const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2); + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); if (bit > 0) { int i; for (i = 0; i < size; i++) { @@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, } #endif -#endif // AV1_TXFM_SSE4_H_ +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h index 7479ac3e1..3b342cd4e 100644 --- a/third_party/aom/av1/common/x86/cfl_simd.h +++ b/third_party/aom/av1/common/x86/cfl_simd.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 @@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); + +#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c index 1099144fe..0acafd044 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -11,10 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c index 637f83cf7..b1a62a4f6 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -11,9 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c index f66dee37d..5016642de 100644 --- a/third_party/aom/av1/common/x86/convolve_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -11,9 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_common_intrin.h" @@ -76,8 +75,8 @@ static INLINE 
__m128i convolve_hi_y(const __m128i *const s, return convolve(ss, coeffs); } -void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, int w, int h, +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, @@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, } } -void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, int w, int h, +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c index 8444ffa93..ae68f0bbb 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c index eb340523a..3f8dafb4b 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c index 
33183fdee..1d029db39 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index debb05a6d..ade2af03e 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -15,6 +15,9 @@ #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. @@ -27,131 +30,125 @@ // ... ... // v124, v125, v126, v127 -static void transpose_32x32_8x8(const __m256i *in, __m256i *out) { +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = 
highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i x0, x1; - u0 = _mm256_unpacklo_epi32(in[0], in[4]); - u1 = _mm256_unpackhi_epi32(in[0], in[4]); + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); - u2 = _mm256_unpacklo_epi32(in[8], in[12]); - u3 = _mm256_unpackhi_epi32(in[8], in[12]); + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); - u4 = _mm256_unpacklo_epi32(in[16], in[20]); - u5 = _mm256_unpackhi_epi32(in[16], in[20]); + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); - u6 = _mm256_unpacklo_epi32(in[24], in[28]); - u7 = _mm256_unpackhi_epi32(in[24], in[28]); + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); x0 = _mm256_unpacklo_epi64(u0, u2); x1 = _mm256_unpacklo_epi64(u4, 
u6); out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[16] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u0, u2); x1 = _mm256_unpackhi_epi64(u4, u6); - out[4] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[20] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpacklo_epi64(u1, u3); x1 = _mm256_unpacklo_epi64(u5, u7); - out[8] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[24] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u1, u3); x1 = _mm256_unpackhi_epi64(u5, u7); - out[12] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[28] = _mm256_permute2f128_si256(x0, x1, 0x31); -} - -static void transpose_32x32_16x16(const __m256i *in, __m256i *out) { - transpose_32x32_8x8(&in[0], &out[0]); - transpose_32x32_8x8(&in[1], &out[32]); - transpose_32x32_8x8(&in[32], &out[1]); - transpose_32x32_8x8(&in[33], &out[33]); -} - -static void transpose_32x32(const __m256i *in, __m256i *out) { - transpose_32x32_16x16(&in[0], &out[0]); - transpose_32x32_16x16(&in[2], &out[64]); - transpose_32x32_16x16(&in[64], &out[2]); - transpose_32x32_16x16(&in[66], &out[66]); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } -static void load_buffer_32x32(const int32_t *coeff, __m256i *in) { +static void load_buffer_32x32(const int32_t *coeff, __m256i *in, + int input_stiride, int size) { int i; - for (i = 0; i < 128; ++i) { - in[i] = _mm256_loadu_si256((const __m256i *)coeff); - coeff += 8; + for (i = 0; i < size; ++i) { + in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride)); } } -static __m256i highbd_clamp_epi32(__m256i x, int bd) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = 
_mm256_set1_epi16(1); - const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(x, max); - clamped = _mm256_andnot_si256(mask, x); - mask = _mm256_and_si256(mask, max); - clamped = _mm256_or_si256(mask, clamped); - mask = _mm256_cmpgt_epi16(clamped, zero); - clamped = _mm256_and_si256(clamped, mask); - - return clamped; -} - -static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3; - const __m256i zero = _mm256_setzero_si256(); - int i = 0; - (void)fliplr; - (void)flipud; - - __m256i round = _mm256_set1_epi32((1 << shift) >> 1); - - while (i < 128) { - u0 = _mm256_loadu_si256((const __m256i *)output); - u1 = _mm256_loadu_si256((const __m256i *)(output + 16)); - - x0 = _mm256_unpacklo_epi16(u0, zero); - x1 = _mm256_unpackhi_epi16(u0, zero); - x2 = _mm256_unpacklo_epi16(u1, zero); - x3 = _mm256_unpackhi_epi16(u1, zero); - - v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20); - v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31); - v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20); - v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31); - - v0 = _mm256_add_epi32(v0, round); - v1 = _mm256_add_epi32(v1, round); - v2 = _mm256_add_epi32(v2, round); - v3 = _mm256_add_epi32(v3, round); - - v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift)); - v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift)); - v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift)); - v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift)); - - v0 = _mm256_add_epi32(v0, x0); - v1 = _mm256_add_epi32(v1, x1); - v2 = _mm256_add_epi32(v2, x2); - v3 = _mm256_add_epi32(v3, x3); - - v0 = _mm256_packus_epi32(v0, v1); - v2 = _mm256_packus_epi32(v2, v3); - - v0 = highbd_clamp_epi32(v0, bd); - v2 = highbd_clamp_epi32(v2, bd); - - _mm256_storeu_si256((__m256i *)output, v0); - _mm256_storeu_si256((__m256i *)(output + 16), 
v2); - output += stride; - i += 4; - } +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; } static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, @@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1, __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm256_max_epi32(a0, *clamp_lo); a0 = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(a1, *clamp_lo); a1 = _mm256_min_epi32(a1, *clamp_hi); - a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + 
+static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + 
+ temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, 
clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const int log_range) { + if (do_cols) { + addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31); + addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30); + addsub_no_clamp_avx2(bf1[2], 
bf1[29], out + 2, out + 29); + addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28); + addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27); + addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26); + addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25); + addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24); + addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23); + addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22); + addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21); + addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20); + addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19); + addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18); + addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17); + addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, 
&clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + x = _mm256_max_epi32(x, clamp_lo_out); + x = _mm256_min_epi32(x, clamp_hi_out); + } + + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = 
_mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + 
idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i 
cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], 
&rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf1 + 12, 
bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + } +} + static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); @@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32], bf0[32]; - int col; - for (col = 0; col < 4; ++col) { + { // stage 0 // stage 1 - bf1[0] = in[0 * 4 + col]; - bf1[1] = in[16 * 4 + col]; - bf1[2] = in[8 * 4 + col]; - bf1[3] = in[24 * 4 + col]; - bf1[4] = in[4 * 4 + col]; - bf1[5] = in[20 * 4 + col]; - bf1[6] = in[12 * 4 + col]; - bf1[7] = in[28 * 4 
+ col]; - bf1[8] = in[2 * 4 + col]; - bf1[9] = in[18 * 4 + col]; - bf1[10] = in[10 * 4 + col]; - bf1[11] = in[26 * 4 + col]; - bf1[12] = in[6 * 4 + col]; - bf1[13] = in[22 * 4 + col]; - bf1[14] = in[14 * 4 + col]; - bf1[15] = in[30 * 4 + col]; - bf1[16] = in[1 * 4 + col]; - bf1[17] = in[17 * 4 + col]; - bf1[18] = in[9 * 4 + col]; - bf1[19] = in[25 * 4 + col]; - bf1[20] = in[5 * 4 + col]; - bf1[21] = in[21 * 4 + col]; - bf1[22] = in[13 * 4 + col]; - bf1[23] = in[29 * 4 + col]; - bf1[24] = in[3 * 4 + col]; - bf1[25] = in[19 * 4 + col]; - bf1[26] = in[11 * 4 + col]; - bf1[27] = in[27 * 4 + col]; - bf1[28] = in[7 * 4 + col]; - bf1[29] = in[23 * 4 + col]; - bf1[30] = in[15 * 4 + col]; - bf1[31] = in[31 * 4 + col]; + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; // stage 2 bf0[0] = bf1[0]; @@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, // stage 9 if (do_cols) { - addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col, - out + 31 * 4 + col); - addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col, - out + 30 * 4 + col); - addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col, - out + 29 * 4 + col); - addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col, - out + 28 * 4 + col); - addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col, - out + 27 * 4 + col); - addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col, - out + 26 * 4 + col); - 
addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col, - out + 25 * 4 + col); - addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col, - out + 24 * 4 + col); - addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col, - out + 23 * 4 + col); - addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col, - out + 22 * 4 + col); - addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col, - out + 21 * 4 + col); - addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col, - out + 20 * 4 + col); - addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col, - out + 19 * 4 + col); - addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col, - out + 18 * 4 + col); - addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col, - out + 17 * 4 + col); - addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col, - out + 16 * 4 + col); + addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31); + addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30); + addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29); + addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28); + addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27); + addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26); + addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25); + addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24); + addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23); + addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22); + addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21); + addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20); + addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19); + addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18); + addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17); + addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16); } else { - addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col, 
- &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col, - out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col, - out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col, - out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col, - out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col, - out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col, - out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, + 
&clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); } } } -void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m256i in[128], out[128]; - const int8_t *shift = inv_txfm_shift_ls[TX_32X32]; - const int txw_idx = get_txw_idx(TX_32X32); - const int txh_idx = get_txh_idx(TX_32X32); +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + +static const transform_1d_avx2 + 
highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 2]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: row transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[32]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j <
buf_size_nonzero_w_div8; ++j) { + __m256i *buf0_cur = buf0 + j * 8; + load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8); + + transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); + } + + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { switch (tx_type) { case DCT_DCT: - load_buffer_32x32(coeff, in); - transpose_32x32(in, out); - idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); - transpose_32x32(in, out); - idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd); + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case DCT_DCT: + av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, 
bd); + break; + // Assembly version doesn't support IDTX, so use C version for it. + case IDTX: + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: assert(0); } } + +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + break; + case TX_64X64: + case TX_16X64: + 
case TX_64X16: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + default: assert(0 && "Invalid transform size"); break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 801a4133b..e29e0baf5 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -15,8 +15,60 @@ #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_packus_epi32(x0, x1); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); @@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i a0 = _mm_add_epi32(in0_w_offset, in1); __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + 
bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, 
&bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], 
bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, 
&bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const int log_range) { + if (do_cols) { + addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31); + addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30); + addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29); + addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28); + addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27); + addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26); + addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25); + addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24); + addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23); + addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22); + addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21); + addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20); + addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19); + addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18); + addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17); + addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, + 
&clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, __m128i *out1, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i a0 = _mm_add_epi32(offset, in0); __m128i a1 = _mm_sub_epi32(offset, in1); + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } @@ -96,9 +361,6 @@ static void 
idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; @@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + if (do_cols) { + addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3); + addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2); + } else { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + } } -static void iadst4x4_sse4_1(__m128i *in, int bit) { +static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); @@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) { u3 = _mm_add_epi32(u3, rnding); u3 = _mm_srai_epi32(u3, bit); + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + u0 = _mm_max_epi32(u0, clamp_lo); + u0 = _mm_min_epi32(u0, clamp_hi); + u1 = _mm_max_epi32(u1, clamp_lo); + u1 = _mm_min_epi32(u1, clamp_hi); + u2 = _mm_max_epi32(u2, clamp_lo); + u2 = 
_mm_min_epi32(u2, clamp_hi); + u3 = _mm_max_epi32(u3, clamp_lo); + u3 = _mm_min_epi32(u3, clamp_hi); + } + in[0] = u0; in[1] = u1; in[2] = u2; @@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) { in[3] = _mm_srai_epi32(in[3], shift); } -static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i clamped, mask; - - mask = _mm_cmpgt_epi16(u, max); - clamped = _mm_andnot_si128(mask, u); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - clamped = _mm_and_si128(clamped, mask); - - return clamped; -} - static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, int fliplr, int flipud, int shift, int bd) { const __m128i zero = _mm_setzero_si128(); @@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, case ADST_DCT: load_buffer_4x4(coeff, in); idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: 
load_buffer_4x4(coeff, in); idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); @@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col); } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i 
clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); } } } @@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, out[12] = u[5]; out[14] = _mm_sub_epi32(kZero, u[1]); } else { - neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi, + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi, + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); } // Odd 8 points: 1, 3, ..., 15 @@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, 
int do_cols, out[13] = u[5]; out[15] = _mm_sub_epi32(kZero, u[1]); } else { - neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi, + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi, + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -976,81 +1258,51 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, } } -// 16x16 -static void load_buffer_16x16(const int32_t *coeff, __m128i *in) { - int i; - for (i = 0; i < 64; ++i) { - in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2))); - } -} - -static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8, - int col) { - int i; - for (i = 0; i < 16; i += 2) { - in8x8[i] = in[col]; - in8x8[i + 1] = in[col + 1]; - col += 4; - } -} - -static void swap_addr(uint16_t **output1, uint16_t **output2) { - uint16_t *tmp; - tmp = *output1; - *output1 = *output2; - *output2 = tmp; -} - -static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in8x8[16]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[8]; - uint16_t *leftDown = &output[8 * stride]; - uint16_t *rightDown = &output[8 * stride + 8]; +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + 
int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i x; - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + x = _mm_max_epi32(x, clamp_lo_out); + x = _mm_min_epi32(x, clamp_hi_out); } - // Left-up quarter - assign_8x8_input_from_16x16(in, in8x8, 0); - write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd); - - // Right-up quarter - assign_8x8_input_from_16x16(in, in8x8, 2); - write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd); - - // Left-down quarter - assign_8x8_input_from_16x16(in, in8x8, 32); - write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd); - - // Right-down quarter - assign_8x8_input_from_16x16(in, in8x8, 34); - write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; } -static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int 
do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); @@ -1059,473 +1311,687 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - int col; - - for (col = 0; col < 4; ++col) { - // stage 0 - // stage 1 - u[0] = in[0 * 4 + col]; - u[1] = in[8 * 4 + col]; - u[2] = in[4 * 4 + col]; - u[3] = in[12 * 4 + col]; - u[4] = in[2 * 4 + col]; - u[5] = in[10 * 4 + col]; - u[6] = in[6 * 4 + col]; - u[7] = in[14 * 4 + col]; - u[8] = in[1 * 4 + col]; - u[9] = in[9 * 4 + col]; - u[10] = in[5 * 4 + col]; - u[11] = in[13 * 4 + col]; - u[12] = in[3 * 4 + col]; - u[13] = in[11 * 4 + col]; - u[14] = in[7 * 4 + col]; - u[15] = in[15 * 4 + col]; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; - // stage 2 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); - v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); - v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, 
bit); - v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); - // stage 3 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); - u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); - u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); - addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); - // stage 4 - x = _mm_mullo_epi32(u[0], cospi32); - y = _mm_mullo_epi32(u[1], cospi32); - v[0] = _mm_add_epi32(x, y); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); - v[1] = _mm_sub_epi32(x, y); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); - v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); - v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, 
bit); - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[11] = u[11]; - v[12] = u[12]; - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - v[15] = u[15]; + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); - // stage 5 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - u[4] = v[4]; + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; - x = _mm_mullo_epi32(v[5], cospi32); - y = _mm_mullo_epi32(v[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); - u[7] = v[7]; - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + // stage 5 + if (do_cols) { + addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7); + addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6); + addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5); + addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 
4); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} - // stage 6 - addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = u[9]; +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - v[10] = _mm_sub_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 0 + // stage 1 + // stage 2 - v[13] = _mm_add_epi32(x, y); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); - x = _mm_mullo_epi32(u[11], 
cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - v[12] = _mm_add_epi32(x, y); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = _mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[14] = u[14]; - v[15] = u[15]; + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col, - out + 15 * 4 + col); - addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col, - out + 14 * 4 + col); - addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col, - out + 13 * 4 + col); - addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col, - out + 12 * 4 + col); - addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col, - out + 11 * 4 + col); - addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col, - out + 10 * 4 + col); - addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col); - addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col); - } else { - addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col, - &clamp_lo, &clamp_hi, 
out_shift); - addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - } + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); } } -static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int 
do_cols, - int bd, int out_shift) { +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi34 = _mm_set1_epi32(cospi[34]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi42 = _mm_set1_epi32(cospi[42]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi50 = _mm_set1_epi32(cospi[50]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi58 = _mm_set1_epi32(cospi[58]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i 
cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - const int col_num = 4; - int col; + __m128i u[8], v[8], x; - // Calculate the column 0, 1, 2, 3 - for (col = 0; col < col_num; ++col) { - // stage 0 - // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + // stage 0 + // stage 1 + // stage 2 - v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); - v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); - v[2] = _mm_add_epi32(v[2], x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); - v[3] = _mm_sub_epi32(v[3], x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] 
= _mm_srai_epi32(u[2], bit); - v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); - v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = _mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = 
_mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; - v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = 
_mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); - // stage 3 - addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; - // stage 4 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - v[8] = 
_mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); 
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 5 + // stage 6 + // stage 7 + if (do_cols) { + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = 
_mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + in[0] = _mm_max_epi32(in[0], clamp_lo_out); + in[0] = _mm_min_epi32(in[0], clamp_hi_out); + } + + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + 
u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; // stage 5 - addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = 
_mm_srai_epi32(u[10], bit); - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + if (do_cols) { + addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15); + addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14); + addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13); + addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12); + addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11); + addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10); + addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9); + addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + 
addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], 
rnding); + v[1] = _mm_srai_epi32(v[1], bit); - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 3 + v[8] = v[0]; + v[9] = v[1]; - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; // stage 7 - addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - 
addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); + y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); v[2] = _mm_add_epi32(y, x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); @@ -1534,11 +2000,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; - v[5] = u[5]; - - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); v[6] = _mm_add_epi32(y, x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); @@ -1547,11 +2010,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); v[10] = _mm_add_epi32(y, x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); @@ -1560,11 +2020,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = 
_mm_mullo_epi32(u[15], cospi32); + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); v[14] = _mm_add_epi32(y, x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); @@ -1575,439 +2032,1904 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, // stage 9 if (do_cols) { - out[0 * col_num + col] = v[0]; - out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2 * col_num + col] = v[12]; - out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4 * col_num + col] = v[6]; - out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6 * col_num + col] = v[10]; - out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8 * col_num + col] = v[3]; - out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10 * col_num + col] = v[15]; - out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12 * col_num + col] = v[5]; - out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14 * col_num + col] = v[9]; - out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); + out[0] = v[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); } else { - neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col, - out + 1 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col, - out + 3 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - 
neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col, - out + 5 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col, - out + 7 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col, - out + 9 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col, - out + 11 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col, - out + 13 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col, - out + 15 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } } -void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[64], out[64]; - const int8_t *shift = inv_txfm_shift_ls[TX_16X16]; - const int txw_idx = get_txw_idx(TX_16X16); - const int txh_idx = get_txh_idx(TX_16X16); +static void 
iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case DCT_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case FLIPADST_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); - break; - case DCT_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, 
inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); - break; - case ADST_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd); - break; - case FLIPADST_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); - break; - default: assert(0); - } -} + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + __m128i zero = _mm_setzero_si128(); + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); 
+ u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); -static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) { - int i, j; + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); - __m128i zero = _mm_setzero_si128(); + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 8; ++j) { - in[16 * i + j] = - _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j)); - in[16 * i + j + 8] = zero; - } - } + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - for (i = 0; i < 512; ++i) in[512 + i] = zero; -} + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); -static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) { - int i, j; - for (i = 0; i < (do_cols ? 
16 : 8); ++i) { - for (j = 0; j < 8; ++j) { - TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j], - in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j], - out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i], - out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]); - } - } -} + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); -static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16, - int col) { - int i; - for (i = 0; i < 16 * 16 / 4; i += 4) { - in16x16[i] = in[col]; - in16x16[i + 1] = in[col + 1]; - in16x16[i + 2] = in[col + 2]; - in16x16[i + 3] = in[col + 3]; - col += 8; - } -} + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); -static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in16x16[16 * 16 / 4]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[16]; - uint16_t *leftDown = &output[16 * stride]; - uint16_t *rightDown = &output[16 * stride + 16]; + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); - } + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - // Left-up quarter - assign_16x16_input_from_32x32(in, in16x16, 0); - write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd); + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // Right-up quarter - assign_16x16_input_from_32x32(in, 
in16x16, 32 / 2 / 4); - write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd); + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - // Left-down quarter - assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4); - write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd); + // stage 4 + y = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - // Right-down quarter - assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4); - write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd); -} + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); -static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32, - int col) { - int i; - for (i = 0; i < 32 * 32 / 4; i += 8) { - in32x32[i] = in[col]; - in32x32[i + 1] = in[col + 1]; - in32x32[i + 2] = in[col + 2]; - in32x32[i + 3] = in[col + 3]; - in32x32[i + 4] = in[col + 4]; - in32x32[i + 5] = in[col + 5]; - in32x32[i + 6] = in[col + 6]; - in32x32[i + 7] = in[col + 7]; - col += 16; - } -} + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], 
bit); -static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in32x32[32 * 32 / 4]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[32]; - uint16_t *leftDown = &output[32 * stride]; - uint16_t *rightDown = &output[32 * stride + 32]; + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); - } + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = 
_mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + 
addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - // Left-up quarter - assign_32x32_input_from_64x64(in, in32x32, 0); - write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - // Right-up quarter - assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4); - write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd); + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - // Left-down quarter - assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4); - write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // Right-down quarter - assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4); - write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd); + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]); + out[2] = 
u[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } } -static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { - int i, j; const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - int col; - - const __m128i cospi1 = _mm_set1_epi32(cospi[1]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi3 = _mm_set1_epi32(cospi[3]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi5 = _mm_set1_epi32(cospi[5]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi7 = _mm_set1_epi32(cospi[7]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi9 = _mm_set1_epi32(cospi[9]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi11 = _mm_set1_epi32(cospi[11]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi13 = _mm_set1_epi32(cospi[13]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi15 = _mm_set1_epi32(cospi[15]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi17 = _mm_set1_epi32(cospi[17]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi19 = _mm_set1_epi32(cospi[19]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi21 = _mm_set1_epi32(cospi[21]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi23 = _mm_set1_epi32(cospi[23]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi25 = _mm_set1_epi32(cospi[25]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi29 = _mm_set1_epi32(cospi[29]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi31 = _mm_set1_epi32(cospi[31]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi35 = _mm_set1_epi32(cospi[35]); - 
const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi39 = _mm_set1_epi32(cospi[39]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi47 = _mm_set1_epi32(cospi[47]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi59 = _mm_set1_epi32(cospi[59]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi63 = _mm_set1_epi32(cospi[63]); - - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i 
cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); - const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); - const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); - const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); - - for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) { - __m128i u[64], v[64]; + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + { + // stage 0 // stage 1 - u[32] = in[1 * 16 + col]; - u[34] = in[17 * 16 + col]; - u[36] = in[9 * 16 + col]; - u[38] = in[25 * 16 + col]; - u[40] = in[5 * 16 + col]; - u[42] = in[21 * 16 + col]; - u[44] = in[13 * 16 + col]; - u[46] = in[29 * 16 + col]; - u[48] = in[3 * 16 + col]; - u[50] = in[19 * 16 + col]; - u[52] = in[11 * 16 + col]; - u[54] = in[27 * 16 + col]; - u[56] = in[7 * 16 + col]; - u[58] = in[23 * 16 + col]; - u[60] = in[15 * 16 + col]; - u[62] = in[31 * 16 + col]; - - v[16] = in[2 * 16 + col]; - v[18] = in[18 * 16 + col]; - v[20] = in[10 * 16 + col]; - v[22] = in[26 * 16 + col]; - v[24] = in[6 * 16 + col]; - v[26] = in[22 * 16 + col]; - v[28] = in[14 * 16 + col]; - v[30] = in[30 * 16 + col]; - - u[8] = in[4 * 16 + col]; - u[10] = in[20 * 16 + col]; - u[12] = in[12 * 16 + col]; - u[14] = in[28 * 16 + col]; - - v[4] = in[8 * 16 + col]; - v[6] = in[24 * 16 + col]; - - u[0] = in[0 * 16 + col]; - u[2] = in[16 * 16 + col]; + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; // stage 2 - v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); - v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); - v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); - v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); - v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); - v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); - v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); - v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); - v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); - v[41] = half_btf_0_sse4_1(&cospim37, 
&u[54], &rnding, bit); - v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); - v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); - v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); - v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); - v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); - v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); - v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); - v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); - v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); - v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); - v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); - v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); - v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); - v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); - v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); - v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); - v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); - v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); - v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); - v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); - v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); - v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; - // stage 3 - u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); - u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); - u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = 
half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], 
v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + if (do_cols) { + addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15); + addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14); + addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13); + addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12); + addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11); + addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10); + addsub_no_clamp_sse4_1(v[6], v[9], out + 
6, out + 9); + addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = 
_mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], 
rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = 
_mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] 
= _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = 
_mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, 
x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, 
out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], 
cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = 
half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const int log_range) { + if (do_cols) { + for (int i = 0; i < 32; i++) { + addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]); + } + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + for (int i = 0; i < 32; i++) { + addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)], + &clamp_lo_out, &clamp_hi_out, out_shift); + } + } +} + +static void 
idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + + { + __m128i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (do_cols) { + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + + x = _mm_max_epi32(x, clamp_lo_out); + x = _mm_min_epi32(x, clamp_hi_out); + } + + out[0] = x; + out[63] = x; + out[1] = x; + out[62] = x; + out[2] = x; + out[61] = x; + out[3] = x; + out[60] = x; + out[4] = x; + out[59] = x; + out[5] = x; + out[58] = x; + out[6] = x; + out[57] = x; + out[7] = x; + out[56] = x; + out[8] = x; + out[55] = x; + out[9] = x; + out[54] = x; + out[10] = x; + out[53] = x; + out[11] = x; + out[52] = x; + out[12] = x; + out[51] = x; + out[13] = x; + out[50] = x; + out[14] = x; + out[49] = x; + out[15] = x; + out[48] = x; + out[16] = x; + out[47] = x; + out[17] = x; + out[46] = x; + out[18] = x; + out[45] = x; + out[19] = x; + out[44] = x; + out[20] = x; + out[43] = x; + out[21] = x; + out[42] = x; + out[22] = x; + out[41] = x; + out[23] = x; + out[40] = x; + 
out[24] = x; + out[39] = x; + out[25] = x; + out[38] = x; + out[26] = x; + out[37] = x; + out[27] = x; + out[36] = x; + out[28] = x; + out[35] = x; + out[29] = x; + out[34] = x; + out[30] = x; + out[33] = x; + out[31] = x; + out[32] = x; + } +} + +static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + 
const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + 
temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 
= half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = 
temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + u[9] = u[9]; + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const 
__m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64]; + __m128i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, 
bit); + u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], 
&rnding, bit); + u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + 
&clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = 
half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_sse4_1(u, 
&cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = 
_mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = 
_mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, 
&u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); @@ -2039,301 
+3961,1388 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - for (i = 16; i < 32; i += 4) { - addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, - &clamp_hi); + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], 
&rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, 
&u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, 
bit); + + // stage 7 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, 
&u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], 
&cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); } - for (i = 32; i < 64; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); } - v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); - v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); - v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); - v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); - v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); - v[62] = half_btf_sse4_1(&cospi60, 
&u[33], &cospi4, &u[62], &rnding, bit); + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } - // stage 5 - u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); - u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + for (i = 32; i < 40; i++) v[i] = u[i]; - for (i = 8; i < 16; i += 4) { - addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, - &clamp_hi); - } + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); - for (i = 16; i < 32; i += 4) { - u[i + 0] = v[i + 0]; - u[i + 3] = v[i + 3]; + for (i = 56; i < 64; 
i++) v[i] = u[i]; + + // stage 11 + if (do_cols) { + for (i = 0; i < 32; i++) { + addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]); + } + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + for (i = 0; i < 32; i++) { + addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], + &clamp_lo_out, &clamp_hi_out, out_shift); + } } + } +} - u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); - u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); - u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); - u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); - u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); +static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1; - for (i = 32; i < 64; i += 8) { - addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, - &clamp_hi); + // stage 0 + // stage 1 + bf1 = in[0]; - addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, - &clamp_hi); - } + // stage 2 + // stage 3 + // stage 4 + // stage 5 + bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); - // stage 6 - v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + bf1 = _mm_max_epi32(bf1, clamp_lo_out); + bf1 = _mm_min_epi32(bf1, clamp_hi_out); + } + out[0] = bf1; + out[1] = bf1; + out[2] = bf1; + out[3] = bf1; + out[4] = bf1; + out[5] = bf1; + out[6] = bf1; + out[7] = bf1; + out[8] = bf1; + out[9] = bf1; + out[10] = bf1; + out[11] = bf1; + out[12] = bf1; + out[13] = bf1; + out[14] = bf1; + out[15] = bf1; + out[16] = bf1; + out[17] = bf1; + out[18] = bf1; + out[19] = bf1; + out[20] = bf1; + out[21] = bf1; + 
out[22] = bf1; + out[23] = bf1; + out[24] = bf1; + out[25] = bf1; + out[26] = bf1; + out[27] = bf1; + out[28] = bf1; + out[29] = bf1; + out[30] = bf1; + out[31] = bf1; +} - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); +static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; - for (i = 8; i < 16; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; - } + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); - for (i = 16; i < 32; i += 8) { - addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, - &clamp_hi); + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], 
&rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); - addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, - &clamp_hi); - } + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; - for (i = 32; i < 64; i += 8) { - v[i + 0] = u[i + 0]; - v[i + 1] = u[i + 1]; - v[i + 6] = u[i + 6]; - v[i + 7] = u[i + 7]; - } + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); - v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); - v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); - v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); - v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); - v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); - v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, 
bit); - v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; - // stage 7 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 
= _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = 
half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = 
half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); - u[4] = v[4]; - u[7] = v[7]; - u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); - for (i = 16; i < 32; i += 8) { - u[i + 0] = v[i + 0]; - u[i + 1] = v[i + 1]; - u[i + 6] = v[i + 6]; - u[i + 7] = v[i + 7]; - } + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); - u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); - u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); - u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim48, &v[21], 
&cospim16, &v[26], &rnding, bit); - u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); - u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); - u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - for (i = 32; i < 64; i += 16) { - for (j = i; j < i + 4; j++) { - addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, - &clamp_hi); - } - } + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); - // stage 8 - for (i = 0; i < 4; ++i) { - addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); - } + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); - v[8] = u[8]; - v[9] = u[9]; - v[14] = u[14]; - v[15] = u[15]; + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); +} - v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = 
_mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = 
_mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; - for (i = 16; i < 20; ++i) { - addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, - &clamp_hi); - } + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; - for (i = 32; i < 36; ++i) { - v[i] = u[i]; - v[i + 12] = u[i + 12]; - v[i + 16] = u[i + 16]; - v[i + 28] = u[i + 28]; - } + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], 
&rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); - v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); - v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); - v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); - v[52] = half_btf_sse4_1(&cospim16, &u[43], 
&cospi48, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); - v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); - v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], 
bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - // stage 9 - for (i = 0; i < 8; ++i) { - addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); - } + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; - for (i = 16; i < 20; ++i) { - u[i] = v[i]; - u[i + 12] = v[i + 12]; - } + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], 
&cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); - u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); - u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); - u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); - u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, 
&v[27], &rnding, bit); + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; - for (i = 32; i < 40; i++) { - addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); - } + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + 
bf1[9] = bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + 
half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + if (do_cols) { + addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31); + addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30); + addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29); + addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28); + addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27); + addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26); + addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25); + addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24); + addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23); + addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22); + addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21); + addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20); + addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19); + addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18); + addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17); + addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[1], bf0[30], out + 
1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} - for (i = 48; i < 56; i++) { - addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); - } +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. 
+ case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} - // stage 10 - for (i = 0; i < 16; i++) { - addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); - } +void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - for (i = 32; i < 40; i++) v[i] = u[i]; +void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. 
+ case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); - v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); - v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); - v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); +void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. 
+ case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - for (i = 56; i < 64; i++) v[i] = u[i]; +void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case DCT_DCT: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + // Assembly version doesn't support IDTX, so use C version for it. + case IDTX: + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: assert(0); + } +} - // stage 11 - if (do_cols) { - for (i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], - &out[16 * (63 - i) + col]); +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. 
+ case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} + +static const transform_1d_sse4_1 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 
+ + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); } } else { - for (i = 0; i < 32; i++) { - addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], - &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi, - out_shift); + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 
3]); } } } -} + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } -void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[64 * 64 / 4], out[64 * 64 / 4]; - const int8_t *shift = inv_txfm_shift_ls[TX_64X64]; - const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0]; - const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0]; + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { switch (tx_type) { case DCT_DCT: - load_buffer_64x64_lower_32x32(coeff, in); - transpose_64x64(in, out, 0); - idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_64x64(in, out, 1); - idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd); + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); break; + default: assert(0); break; + } +} - default: - av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd); +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + 
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + break; + case TX_64X64: + case TX_16X64: + case TX_64X16: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); break; + default: assert(0 && "Invalid transform size"); break; } } diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c 
b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c index 608bd88a4..e298cf653 100644 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h index b29bd1d79..6f24e5948 100644 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H -#define _HIGHBD_TXFM_UTILITY_SSE4_H +#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ +#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ #include /* SSE4.1 */ @@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { out[63]); } +static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { + for (int j = 0; j < 8; j++) { + for (int i = 0; i < 8; i++) { + TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], + input[i * 32 + j + 16], input[i * 32 + j + 24], + output[j * 32 + i + 0], output[j * 32 + i + 8], + output[j * 32 + i + 16], output[j * 32 + i + 24]); + } + } +} + // Note: // rounding = 1 << (bit - 1) static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, @@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, return x; } -#endif // _HIGHBD_TXFM_UTILITY_SSE4_H +typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift); + +typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + const int num_cols); + +void 
av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd); + +#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c index a08beaafd..4bcab0564 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c @@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, - int sx, int alpha, int k, - const int offset_bits_horiz, - const int reduce_bits_horiz) { +static const uint8_t highbd_shuffle_alpha0_mask0[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; +static const uint8_t highbd_shuffle_alpha0_mask1[16] = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); @@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 
4 5 4 5 for pixels 0, 2, 4, 6 - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); - - // Calculate filtered results - const __m128i res_0 = _mm_madd_epi16(src, coeff_0); - const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); - const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); - const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); - - __m128i res_even = - _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); + coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Filter odd-index pixels const __m128i tmp_1 = _mm_loadu_si128( @@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( + int sx, __m128i *coeff) { + // Filter coeff + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); + coeff[2] = 
_mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); + coeff[4] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; - const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); - const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); - const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); - const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), 
_mm_add_epi32(res_3, res_7)); @@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); } +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + 
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + 
const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, @@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); - horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k, - offset_bits_horiz, reduce_bits_horiz); + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); } } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); } // 
Vertical filter diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c index d1ea26290..9f2e2b457 100644 --- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c @@ -13,7 +13,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" @@ -21,6 +20,21 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, @@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -68,13 +78,11 @@ void 
av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (void)subpel_y_q4; for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; for (j = 0; j < w; j += 8) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))), - 0x20); + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); __m256i res = convolve_lowbd_x(data, coeffs, filt); @@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, // Accumulate values into the destination buffer if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i round_const = _mm256_set1_epi32((1 << conv_params->round_1) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t 
*dst0, for (j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; - // Load lines a and b. Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); - - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += 
src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); @@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, if (w - j < 16) { if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); @@ -325,19 +282,12 @@ 
void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm256_add_epi16(res_hi_round, offset_const_2); if (do_average) { - const __m256i data_ref_0_lo = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - - const __m256i data_ref_0_hi = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))), - 0x20); + const __m256i data_ref_0_lo = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); const __m256i comp_avg_res_lo = comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); @@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, for (j = 0; j < w; j += 8) { /* Horizontal filter */ { + const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i 
* src_stride) + j])); + __m256i data = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); if (i + 1 < im_h) data = _mm256_inserti128_si256( - data, - _mm_loadu_si128( - (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), - 1); + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); + src_h += (src_stride << 1); __m256i res = convolve_lowbd_x(data, coeffs_x, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), @@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const 
int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); const int offset_0 = @@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, // Accumulate values into the destination buffer if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c index ffbb31849..f645e0454 100644 --- a/third_party/aom/av1/common/x86/reconinter_avx2.c +++ b/third_party/aom/av1/common/x86/reconinter_avx2.c @@ -16,8 +16,504 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "av1/common/blockd.h" +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + stride0); + const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); + const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + stride1); + const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); + const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (stride0 << 2); + src1 += (stride1 << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + stride0); + const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); + const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + stride1); + const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); + const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); + const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const 
__m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += stride0 << 2; + src1 += stride1 << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + stride0); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + stride1); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += stride0 << 1; + src1 += stride1 << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + 
const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + 
src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = 
_mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = 
yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, 
int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = 
yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + 
yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 
0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. + assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + void av1_build_compound_diffwtd_mask_highbd_avx2( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c index 375def62e..0aaf1f454 100644 --- a/third_party/aom/av1/common/x86/selfguided_avx2.c +++ b/third_party/aom/av1/common/x86/selfguided_avx2.c @@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, } } -void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, - int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, - int highbd) { +int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int 
width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, // Ctl and Dtl is 32-byte aligned. const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); - DECLARE_ALIGNED(32, int32_t, - buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]); + int32_t *buf = aom_memalign( + 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); + if (!buf) return -1; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } + aom_free(buf); + return 0; } void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, @@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1, - width, eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_avx2( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index c64150b9d..ea3f6d942 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, } } -void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, - int height, int dgd_stride, - int32_t 
*flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { - DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]); - memset(buf, 0, sizeof(buf)); +int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + int32_t *buf = (int32_t *)aom_memalign( + 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + if (!buf) return -1; + memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } + aom_free(buf); + return 0; } void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, @@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1, - width, eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_sse4_1( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c index efc542cbf..b810cea2e 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse4.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c @@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, 0 }; 
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, - int alpha, int k, +static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; +static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; +static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; +static const uint8_t shuffle_gamma0_mask3[16] = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, - const int reduce_bits_horiz) { + const int reduce_bits_horiz, int k) { const __m128i src_even = _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); const __m128i src_odd = _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = 
_mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. + const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); @@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14); + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15); + coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} - // The pixel order we need for 'src' is: - // 0 2 2 4 4 6 6 8 1 3 
3 5 5 7 7 9 - const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); - const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02); - // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 - const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), - _mm_srli_si128(src_odd, 4)); - const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46); - // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 - const __m128i src_13 = - _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); - const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13); - // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 - const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), - _mm_srli_si128(src_even, 6)); - const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57); +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); - const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67)); +} - // Note: The values res_02 + res_46 and res_13 + res_57 both - // fit into int16s at this point, but their sum may be too wide to fit - // into an int16. However, once we also add round_const, the sum of - // all of these fits into a uint16. 
- // - // The wrapping behaviour of _mm_add_* is used here to make sure we - // get the correct result despite converting between different - // (implicit) types. - const __m128i res_even = _mm_add_epi16(res_02, res_46); - const __m128i res_odd = _mm_add_epi16(res_13, res_57); - const __m128i res = - _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); - tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + 
prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const 
int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + *wt = _mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + 
coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const 
__m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { 
+ __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + __m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
+ if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, 
+ conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, 
round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, 
alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); } void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, @@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; - const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); const __m128i reduce_bits_vert_const = _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const __m128i res_sub_const = - _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - - (1 << (offset_bits - conv_params->round_1 - 1))); - __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); - __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); /* Note: For this code to work, the left/right frame borders need to be @@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); } }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { @@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const 
int32_t *mat, const uint8_t *ref, int width, reduce_bits_horiz); } } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); } // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = 
_mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // 
Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - if (conv_params->is_compound) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - res_lo = _mm_add_epi32(res_lo, res_add_const); - res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), - reduce_bits_vert_shift); - const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo); - __m128i res_lo_16; - if (conv_params->do_average) { - __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - const __m128i p_16 = _mm_loadl_epi64(p); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); - const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); - } - - res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const); - - res_lo_16 = _mm_sra_epi16( - _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift); - __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); - *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); - } else { - _mm_storel_epi64(p, temp_lo_16); - } - if (p_width > 4) { - __m128i *const p4 = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; - - res_hi = _mm_add_epi32(res_hi, res_add_const); - res_hi = - _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), - reduce_bits_vert_shift); - const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi); - __m128i res_hi_16; - - if (conv_params->do_average) { - __m128i *const dst8_4 = - (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; - const __m128i p4_16 = _mm_loadl_epi64(p4); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); - const 
__m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); - res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); - } - res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const); - - res_hi_16 = _mm_sra_epi16( - _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift); - __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); - *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); - - } else { - _mm_storel_epi64(p4, temp_hi_16); - } - } - } else { - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + - ((1 << reduce_bits_vert) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), reduce_bits_vert); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), reduce_bits_vert); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. 
- if (p_width == 4) { - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); - } else { - _mm_storel_epi64(p, res_8bit); - } - } - } + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); } } } diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c index e1449fd21..87a6e1239 100644 --- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c @@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED(32, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c index 3083d224b..f9d00b733 100644 --- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c +++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c @@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED(16, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h index 9099d081b..288e5e63e 100644 --- a/third_party/aom/av1/decoder/accounting.h +++ 
b/third_party/aom/av1/decoder/accounting.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_ACCOUNTING_H_ -#define AOM_ACCOUNTING_H_ +#ifndef AOM_AV1_DECODER_ACCOUNTING_H_ +#define AOM_AV1_DECODER_ACCOUNTING_H_ #include #include "aom/aomdx.h" @@ -79,4 +79,4 @@ void aom_accounting_dump(Accounting *accounting); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_ACCOUNTING_H_ +#endif // AOM_AV1_DECODER_ACCOUNTING_H_ diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c index 6dbc4f3eb..31f14b531 100644 --- a/third_party/aom/av1/decoder/decodeframe.c +++ b/third_party/aom/av1/decoder/decodeframe.c @@ -43,6 +43,7 @@ #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/frame_buffers.h" #include "av1/common/idct.h" #include "av1/common/mvref_common.h" #include "av1/common/pred_common.h" @@ -87,18 +88,25 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, int only_chroma) { - const int val = 1 << (seq_params->bit_depth - 1); - - for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { - const int is_uv = plane > 0; - for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { - if (seq_params->use_highbitdepth) { - // TODO(yaowu): replace this with aom_memset16() for speed - for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) { - uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); - base[row_idx * buf->strides[is_uv] + col_idx] = val; + if (seq_params->use_highbitdepth) { + const int val = 1 << (seq_params->bit_depth - 1); + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + uint16_t 
*const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); + // Set the first row to neutral grey. Then copy the first row to all + // subsequent rows. + if (buf->crop_heights[is_uv] > 0) { + aom_memset16(base, val, buf->crop_widths[is_uv]); + for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memcpy(&base[row_idx * buf->strides[is_uv]], base, + sizeof(*base) * buf->crop_widths[is_uv]); } - } else { + } + } + } else { + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7, buf->crop_widths[is_uv]); } @@ -687,11 +695,10 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, for (int x = 0; x < b8_w; x += b4_w) { MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; is_compound = has_second_ref(this_mbmi); - DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]); int tmp_dst_stride = 8; assert(bw < 8 || bh < 8); ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd); + 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); conv_params.use_jnt_comp_avg = 0; struct buf_2d *const dst_buf = &pd->dst; uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; @@ -735,7 +742,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref], &pre, &src_stride); - conv_params.ref = ref; conv_params.do_average = ref; if (is_masked_compound_type(mi->interinter_comp.type)) { // masked compound type has its own average mechanism @@ -762,7 +768,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, uint8_t *const dst = dst_buf->buf; uint8_t *pre[2]; SubpelParams subpel_params[2]; - DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); int src_stride[2]; 
for (ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = @@ -797,7 +802,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, } ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd); + 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset, &conv_params.use_jnt_comp_avg, is_compound); @@ -808,7 +813,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global[ref]; warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - conv_params.ref = ref; conv_params.do_average = ref; if (is_masked_compound_type(mi->interinter_comp.type)) { // masked compound type has its own average mechanism @@ -931,7 +935,7 @@ static void dec_build_prediction_by_above_preds( // Adjust mb_to_bottom_edge to have the correct value for the OBMC // prediction block. This is half the height of the original block, // except for 128-wide blocks, where we only use a height of 32. - int this_height = xd->n8_h * MI_SIZE; + int this_height = xd->n4_h * MI_SIZE; int pred_height = AOMMIN(this_height / 2, 32); xd->mb_to_bottom_edge += (this_height - pred_height) * 8; @@ -984,7 +988,7 @@ static void dec_build_prediction_by_left_preds( // Adjust mb_to_right_edge to have the correct value for the OBMC // prediction block. This is half the width of the original block, // except for 128-wide blocks, where we only use a width of 32. 
- int this_width = xd->n8_w * MI_SIZE; + int this_width = xd->n4_w * MI_SIZE; int pred_width = AOMMIN(this_width / 2, 32); xd->mb_to_right_edge += (this_width - pred_width) * 8; @@ -1006,8 +1010,6 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { const int num_planes = av1_num_planes(cm); - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; @@ -1018,19 +1020,23 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); } else { - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; - dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; - dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] 
= xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; } dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, dst_width1, dst_height1, dst_stride1); @@ -1069,8 +1075,9 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, } dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - if (mbmi->motion_mode == OBMC_CAUSAL) + if (mbmi->motion_mode == OBMC_CAUSAL) { dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + } #if CONFIG_MISMATCH_DEBUG for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; @@ -1225,9 +1232,18 @@ static void decode_token_recon_block(AV1Decoder *const pbi, set_color_index_map_offset); } +#if LOOP_FILTER_BITMASK +static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi); +#endif + static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, - TX_SIZE tx_size, int depth, int blk_row, - int blk_col, aom_reader *r) { + TX_SIZE tx_size, int depth, +#if LOOP_FILTER_BITMASK + AV1_COMMON *cm, int mi_row, int mi_col, +#endif + int blk_row, int blk_col, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; int is_split = 0; const BLOCK_SIZE bsize = mbmi->sb_type; @@ -1271,15 +1287,29 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, mbmi->tx_size = sub_txs; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, sub_txs, tx_size); +#if LOOP_FILTER_BITMASK + store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8, + TX_4X4, mbmi); +#endif return; } +#if LOOP_FILTER_BITMASK + if (depth + 1 == MAX_VARTX_DEPTH) { + store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], sub_txs, 
mbmi); + } +#endif assert(bsw > 0 && bsh > 0); for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { int offsetr = blk_row + row; int offsetc = blk_col + col; - read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r); + read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, +#if LOOP_FILTER_BITMASK + cm, mi_row, mi_col, +#endif + offsetr, offsetc, r); } } } else { @@ -1293,6 +1323,10 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); +#if LOOP_FILTER_BITMASK + store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], tx_size, mbmi); +#endif } } @@ -1330,6 +1364,191 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter, } } +#if LOOP_FILTER_BITMASK +static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (tx_size > TX_64X64) & (tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = 
mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size]; + mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (tx_size == TX_32X64) { + mask_id = 59; + } else if (tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (tx_size - TX_4X16); + } + int index = 0; + const int row = mi_row % MI_SIZE_64X64; + const int col = mi_col % MI_SIZE_64X64; + const int shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + // Use a lookup table that provides one bitmask for a given block size and + // a univariant transform size. 
+ int index; + int shift; + int row; + int col; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = mbmi->tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (mbmi->tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size]; + mask_id = + 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (mbmi->tx_size == TX_32X64) { + mask_id = 59; + } else if (mbmi->tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (mbmi->tx_size - TX_4X16); + } + row = mi_row % MI_SIZE_64X64; + col = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. 
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + int index; + int shift; + int row; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int row_start = mi_row % MI_SIZE_64X64; + const int col_start = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col_start, row_start, &index); + const uint64_t top_edge_mask = + ((uint64_t)1 << (shift + mi_size_wide[bsize])) - ((uint64_t)1 << shift); + lfm->is_horz_border.bits[index] |= top_edge_mask; + const int is_vert_border = mask_id_table_vert_border[bsize]; + const int vert_shift = block_size_high[bsize] <= 8 ? 
shift : col_start; + for (int i = 0; i + index < 4; ++i) { + lfm->is_vert_border.bits[i + index] |= + (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift); + } + const int is_skip = mbmi->skip && is_inter_block(mbmi); + if (is_skip) { + const int is_skip_mask = mask_id_table_tx_4x4[bsize]; + for (int i = 0; i + index < 4; ++i) { + lfm->skip.bits[i + index] |= + (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift); + } + } + const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi); + const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi); + const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi); + const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi); + for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) { + index = 0; + row = r % MI_SIZE_64X64; + memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + } +} +#endif + static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize) { @@ -1353,14 +1572,46 @@ static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td, for (int idy = 0; idy < height; idy += bh) for (int idx = 0; idx < width; idx += bw) - read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r); + read_tx_size_vartx(xd, mbmi, max_tx_size, 0, +#if LOOP_FILTER_BITMASK + cm, mi_row, mi_col, +#endif + idy, idx, r); } else { mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r); if (inter_block_tx) memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, 
xd->n8_h, + set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, mbmi->skip && is_inter_block(mbmi), xd); +#if LOOP_FILTER_BITMASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col, + BLOCK_64X64, mbmi); + } + } + } +#endif + } +#if LOOP_FILTER_BITMASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64, + mbmi); + } + } } +#endif if (cm->delta_q_present_flag) { for (int i = 0; i < MAX_SEGMENTS; i++) { @@ -1952,6 +2203,11 @@ static void setup_quantization(AV1_COMMON *const cm, cm->v_dc_delta_q = cm->u_dc_delta_q; cm->v_ac_delta_q = cm->u_ac_delta_q; } + } else { + cm->u_dc_delta_q = 0; + cm->u_ac_delta_q = 0; + cm->v_dc_delta_q = 0; + cm->v_ac_delta_q = 0; } cm->dequant_bit_depth = seq_params->bit_depth; cm->using_qmatrix = aom_rb_read_bit(rb); @@ -2082,29 +2338,9 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) { cm->cur_frame->height = cm->height; } -static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, - struct aom_read_bit_buffer *rb) { - const SequenceHeader *const seq_params = &cm->seq_params; - int width, height; +static void setup_buffer_pool(AV1_COMMON *cm) { BufferPool *const pool = cm->buffer_pool; - - if (frame_size_override_flag) { - int num_bits_width = seq_params->num_bits_width; - int 
num_bits_height = seq_params->num_bits_height; - av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); - if (width > seq_params->max_frame_width || - height > seq_params->max_frame_height) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, - "Frame dimensions are larger than the maximum values"); - } - } else { - width = seq_params->max_frame_width; - height = seq_params->max_frame_height; - } - - setup_superres(cm, rb, &width, &height); - resize_context_buffers(cm, width, height); - setup_render_size(cm, rb); + const SequenceHeader *const seq_params = &cm->seq_params; lock_buffer_pool(pool); if (aom_realloc_frame_buffer( @@ -2140,6 +2376,31 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; } +static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, + struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = &cm->seq_params; + int width, height; + + if (frame_size_override_flag) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + if (width > seq_params->max_frame_width || + height > seq_params->max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame dimensions are larger than the maximum values"); + } + } else { + width = seq_params->max_frame_width; + height = seq_params->max_frame_height; + } + + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + setup_buffer_pool(cm); +} + static void setup_sb_size(SequenceHeader *seq_params, struct aom_read_bit_buffer *rb) { set_sb_size(seq_params, aom_rb_read_bit(rb) ? 
BLOCK_128X128 : BLOCK_64X64); @@ -2158,7 +2419,6 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, int width, height; int found = 0; int has_valid_ref_frame = 0; - BufferPool *const pool = cm->buffer_pool; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { if (aom_rb_read_bit(rb)) { YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; @@ -2208,39 +2468,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } - - lock_buffer_pool(pool); - if (aom_realloc_frame_buffer( - get_frame_new_buffer(cm), cm->width, cm->height, - seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, - &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, - pool->cb_priv)) { - unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - } - unlock_buffer_pool(pool); - - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = - seq_params->subsampling_x; - pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = - seq_params->subsampling_y; - pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = - (unsigned int)seq_params->bit_depth; - pool->frame_bufs[cm->new_fb_idx].buf.color_primaries = - seq_params->color_primaries; - pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics = - seq_params->transfer_characteristics; - pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients = - seq_params->matrix_coefficients; - pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome; - pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position = - seq_params->chroma_sample_position; - pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range; - pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; - pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; + setup_buffer_pool(cm); } // 
Same function as av1_read_uniform but reading from uncompresses header wb @@ -2252,7 +2480,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { if (v < m) return v; else - return (v << 1) - m + aom_rb_read_literal(rb, 1); + return (v << 1) - m + aom_rb_read_bit(rb); } static void read_tile_info_max_tile(AV1_COMMON *const cm, @@ -2344,6 +2572,10 @@ static void read_tile_info(AV1Decoder *const pbi, // tile to use for cdf update cm->context_update_tile_id = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols); + if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid context_update_tile_id"); + } // tile size magnitude pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; } @@ -2746,31 +2978,13 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, #endif // CONFIG_MULTITHREAD } -static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) { - AV1_COMMON *cm = &pbi->common; - int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; - - return sb_rows; -} - -static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) { - AV1_COMMON *cm = &pbi->common; - int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; - - return sb_cols; -} - static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, TileInfo tile_info, const int mi_row) { AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); TileDataDec *const tile_data = pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col; - const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info); + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, 
tile_info); const int sb_row_in_tile = (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; int sb_col_in_tile = 0; @@ -2792,15 +3006,11 @@ static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, } static int check_trailing_bits_after_symbol_coder(aom_reader *r) { + if (aom_reader_has_overflowed(r)) return -1; + uint32_t nb_bits = aom_reader_tell(r); uint32_t nb_bytes = (nb_bits + 7) >> 3; - - const uint8_t *p_begin = aom_reader_find_begin(r); - const uint8_t *p_end = aom_reader_find_end(r); - - // It is legal to have no padding bytes (nb_bytes == p_end - p_begin). - if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1; - const uint8_t *p = p_begin + nb_bytes; + const uint8_t *p = aom_reader_find_begin(r) + nb_bytes; // aom_reader_tell() returns 1 for a newly initialized decoder, and the // return value only increases as values are decoded. So nb_bits > 0, and @@ -2810,6 +3020,7 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) { if ((last_byte & (2 * pattern - 1)) != pattern) return -1; // Make sure that all padding bytes are zero as required by the spec. + const uint8_t *p_end = aom_reader_find_end(r); while (p < p_end) { if (*p != 0) return -1; p++; @@ -2863,6 +3074,11 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row, // Bit-stream parsing and decoding of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, cm->seq_params.sb_size, 0x3); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&td->xd.corrupted, 1); + return; + } } } @@ -2950,6 +3166,11 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, td->xd.corrupted = 0; td->xd.mc_buf[0] = td->mc_buf[0]; td->xd.mc_buf[1] = td->mc_buf[1]; + td->xd.tmp_conv_dst = td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j]; + } + for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { const int row = inv_row_order ? 
tile_rows - 1 - tile_row : tile_row; @@ -3236,6 +3457,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) { #endif frame_row_mt_info->row_mt_exit = 1; #if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); pthread_mutex_unlock(pbi->row_mt_mutex_); #endif return 0; @@ -3386,16 +3608,24 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm, aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); } -void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) { +void av1_free_mc_tmp_buf(ThreadData *thread_data) { int ref; for (ref = 0; ref < 2; ref++) { - if (use_highbd) + if (thread_data->mc_buf_use_highbd) aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref])); else aom_free(thread_data->mc_buf[ref]); thread_data->mc_buf[ref] = NULL; } thread_data->mc_buf_size = 0; + thread_data->mc_buf_use_highbd = 0; + + aom_free(thread_data->tmp_conv_dst); + thread_data->tmp_conv_dst = NULL; + for (int i = 0; i < 2; ++i) { + aom_free(thread_data->tmp_obmc_bufs[i]); + thread_data->tmp_obmc_bufs[i] = NULL; + } } static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data, @@ -3411,6 +3641,17 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data, } } thread_data->mc_buf_size = buf_size; + thread_data->mc_buf_use_highbd = use_highbd; + + CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->tmp_conv_dst))); + for (int i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, thread_data->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->tmp_obmc_bufs[i]))); + } } static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook, @@ -3425,6 +3666,10 @@ static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook, thread_data->td->xd.corrupted = 0; thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0]; thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1]; + 
thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; + } winterface->sync(worker); worker->hook = worker_hook; @@ -3511,7 +3756,7 @@ static void decode_mt_init(AV1Decoder *pbi) { for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; if (thread_data->td->mc_buf_size != buf_size) { - av1_free_mc_tmp_buf(thread_data->td, use_highbd); + av1_free_mc_tmp_buf(thread_data->td); allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); } } @@ -3783,8 +4028,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col; av1_tile_init(&tile_data->tile_info, cm, row, col); - max_sb_rows = - AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info)); + max_sb_rows = AOMMAX(max_sb_rows, + av1_get_sb_rows_in_tile(cm, tile_data->tile_info)); } } @@ -3905,6 +4150,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm, if (!seq_params->monochrome) pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); + else + pars->chroma_scaling_from_luma = 0; if (seq_params->monochrome || pars->chroma_scaling_from_luma || ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && @@ -4412,6 +4659,29 @@ static void show_existing_frame_reset(AV1Decoder *const pbi, *cm->fc = cm->frame_contexts[existing_frame_idx]; } +static INLINE void reset_frame_buffers(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); + + lock_buffer_pool(cm->buffer_pool); + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i != cm->new_fb_idx) { + frame_bufs[i].ref_count = 0; + cm->buffer_pool->release_fb_cb(cm->buffer_pool->cb_priv, + 
&frame_bufs[i].raw_frame_buffer); + } else { + assert(frame_bufs[i].ref_count == 1); + } + frame_bufs[i].cur_frame_offset = 0; + av1_zero(frame_bufs[i].ref_frame_offset); + } + av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); + unlock_buffer_pool(cm->buffer_pool); +} + // On success, returns 0. On failure, calls aom_internal_error and does not // return. static int read_uncompressed_header(AV1Decoder *pbi, @@ -4443,6 +4713,11 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->reset_decoder_state = 0; if (cm->show_existing_frame) { + if (pbi->sequence_header_changed) { + aom_internal_error( + &cm->error, AOM_CODEC_CORRUPT_FRAME, + "New sequence header starts with a show_existing_frame."); + } // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); const int frame_to_show = cm->ref_frame_map[existing_frame_idx]; @@ -4493,6 +4768,18 @@ static int read_uncompressed_header(AV1Decoder *pbi, } cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits + if (pbi->sequence_header_changed) { + if (pbi->common.frame_type == KEY_FRAME) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(&pbi->common); + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Sequence header has changed without a keyframe."); + } + } + cm->show_frame = aom_rb_read_bit(rb); if (seq_params->still_picture && (cm->frame_type != KEY_FRAME || !cm->show_frame)) { @@ -4582,8 +4869,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } } - frame_size_override_flag = - frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1); + frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb); cm->frame_offset = aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1); @@ -5152,7 +5438,7 @@ static void setup_frame_info(AV1Decoder *pbi) { const int use_highbd = cm->seq_params.use_highbitdepth ? 
1 : 0; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { - av1_free_mc_tmp_buf(&pbi->td, use_highbd); + av1_free_mc_tmp_buf(&pbi->td); allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); } } @@ -5166,6 +5452,11 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, const int tile_count_tg = end_tile - start_tile + 1; if (initialize_flag) setup_frame_info(pbi); + const int num_planes = av1_num_planes(cm); +#if LOOP_FILTER_BITMASK + av1_loop_filter_frame_init(cm, 0, num_planes); + av1_zero_array(cm->lf.lfm, cm->lf.lfm_num); +#endif if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) && pbi->row_mt) @@ -5177,7 +5468,6 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, else *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); - const int num_planes = av1_num_planes(cm); // If the bit stream is monochrome, set the U and V buffers to a constant. if (num_planes < 3) { set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); @@ -5190,7 +5480,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, if (!cm->allow_intrabc && !cm->single_tile_decoding) { if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { #if LOOP_FILTER_BITMASK - av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0, + av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0, num_planes, 0); #else if (pbi->num_workers > 1) { @@ -5255,6 +5545,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, if (!xd->corrupted) { if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + assert(cm->context_update_tile_id < pbi->allocated_tiles); *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); } diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h index d289b31f2..ddad273f1 100644 --- 
a/third_party/aom/av1/decoder/decodeframe.h +++ b/third_party/aom/av1/decoder/decodeframe.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_DECODER_DECODEFRAME_H_ -#define AV1_DECODER_DECODEFRAME_H_ +#ifndef AOM_AV1_DECODER_DECODEFRAME_H_ +#define AOM_AV1_DECODER_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { @@ -74,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer( struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end); -void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd); +void av1_free_mc_tmp_buf(struct ThreadData *thread_data); void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); @@ -82,4 +82,4 @@ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); } // extern "C" #endif -#endif // AV1_DECODER_DECODEFRAME_H_ +#endif // AOM_AV1_DECODER_DECODEFRAME_H_ diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c index 5e920b18d..551e4d543 100644 --- a/third_party/aom/av1/decoder/decodemv.c +++ b/third_party/aom/av1/decoder/decodemv.c @@ -94,42 +94,26 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, } return reduced_delta_qindex; } -static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd, - aom_reader *r, int lf_id, - MB_MODE_INFO *const mbmi, int mi_col, +static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, + aom_cdf_prob *const cdf, + const MB_MODE_INFO *const mbmi, int mi_col, int mi_row) { - int sign, abs, reduced_delta_lflevel = 0; - BLOCK_SIZE bsize = mbmi->sb_type; + int reduced_delta_lflevel = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; const int b_col = mi_col & (cm->seq_params.mib_size - 1); const int b_row = mi_row & (cm->seq_params.mib_size - 1); const int read_delta_lf_flag = (b_col == 0 && b_row == 0); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && 
read_delta_lf_flag) { - if (cm->delta_lf_multi) { - assert(lf_id >= 0 && - lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT - : FRAME_LF_COUNT - 2)); - abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id], - DELTA_LF_PROBS + 1, ACCT_STR); - } else { - abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1, - ACCT_STR); - } + int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_LF_SMALL); if (!smallval) { const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; const int thr = (1 << rem_bits) + 1; abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; } - - if (abs) { - sign = aom_read_bit(r, ACCT_STR); - } else { - sign = 1; - } - + const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1; reduced_delta_lflevel = sign ? -abs : abs; } return reduced_delta_lflevel; @@ -618,19 +602,22 @@ static void read_filter_intra_mode_info(const AV1_COMMON *const cm, void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, aom_reader *r) { MB_MODE_INFO *mbmi = xd->mi[0]; - const int inter_block = is_inter_block(mbmi); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const int txk_type_idx = av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx]; + *tx_type = DCT_DCT; + + // No need to read transform type if block is skipped. + if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + return; + + // No need to read transform type for lossless mode(qindex==0). + const int qindex = + cm->seg.enabled ? 
xd->qindex[mbmi->segment_id] : cm->base_qindex; + if (qindex <= 0) return; - const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; - if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 && - ((!cm->seg.enabled && cm->base_qindex > 0) || - (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && - !mbmi->skip && - !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int inter_block = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) { const TxSetType tx_set_type = av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used); const int eset = @@ -639,23 +626,22 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, // there is no need to read the tx_type assert(eset != 0); + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (inter_block) { *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } else { - PREDICTION_MODE intra_dir; - if (mbmi->filter_intra_mode_info.use_filter_intra) - intra_dir = - fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; - else - intra_dir = mbmi->mode; + const PREDICTION_MODE intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode] + : mbmi->mode; *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( - r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode], av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } - } else { - *tx_type = DCT_DCT; } } @@ -720,6 +706,43 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, } } +// If delta q is present, reads delta_q index. +// Also reads delta_q loop filter levels, if present. 
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, + const int mi_row, const int mi_col, + aom_reader *r) { + if (cm->delta_q_present_flag) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + xd->current_qindex += + read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res; + /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ + xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + if (cm->delta_lf_present_flag) { + if (cm->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi, + mi_col, mi_row) * + cm->delta_lf_res; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } else { + const int tmp_lvl = xd->delta_lf_from_base + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, + mbmi, mi_col, mi_row) * + cm->delta_lf_res; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } + } +} + static void read_intra_frame_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, int mi_row, int mi_col, aom_reader *r) { @@ -743,33 +766,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm, read_cdef(cm, r, xd, mi_col, mi_row); - if (cm->delta_q_present_flag) { - xd->current_qindex += - read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res; - /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ - xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); - if (cm->delta_lf_present_flag) { - if (cm->delta_lf_multi) { - const int frame_lf_count = - av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { - const int tmp_lvl = - xd->delta_lf[lf_id] + - read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = - clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); - } - } else { - const int tmp_lvl = - xd->delta_lf_from_base + - read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->delta_lf_from_base = xd->delta_lf_from_base = - clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); - } - } - } + read_delta_q_params(cm, xd, mi_row, mi_col, r); mbmi->current_qindex = xd->current_qindex; @@ -1402,7 +1399,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, mbmi->motion_mode = SIMPLE_TRANSLATION; if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode && !has_second_ref(mbmi)) - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); if (mbmi->ref_frame[1] != INTRA_FRAME) @@ -1463,20 +1460,20 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, read_mb_interp_filter(cm, xd, mbmi, r); if (mbmi->motion_mode == WARPED_CAUSAL) { - mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; - mbmi->wm_params[0].invalid = 0; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.invalid = 0; - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, - mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); - if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col)) { + &mbmi->wm_params, 
mi_row, mi_col)) { #if WARPED_MOTION_DEBUG printf("Warning: unexpected warped model from aomenc\n"); #endif - mbmi->wm_params[0].invalid = 1; + mbmi->wm_params.invalid = 1; } } @@ -1512,33 +1509,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi, read_cdef(cm, r, xd, mi_col, mi_row); - if (cm->delta_q_present_flag) { - xd->current_qindex += - read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res; - /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ - xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); - if (cm->delta_lf_present_flag) { - if (cm->delta_lf_multi) { - const int frame_lf_count = - av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { - const int tmp_lvl = - xd->delta_lf[lf_id] + - read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = - clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); - } - } else { - const int tmp_lvl = - xd->delta_lf_from_base + - read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) * - cm->delta_lf_res; - mbmi->delta_lf_from_base = xd->delta_lf_from_base = - clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); - } - } - } + read_delta_q_params(cm, xd, mi_row, mi_col, r); if (!mbmi->skip_mode) inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h index 6243bb168..1625e5bd2 100644 --- a/third_party/aom/av1/decoder/decodemv.h +++ b/third_party/aom/av1/decoder/decodemv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_DECODER_DECODEMV_H_ -#define AV1_DECODER_DECODEMV_H_ +#ifndef AOM_AV1_DECODER_DECODEMV_H_ +#define AOM_AV1_DECODER_DECODEMV_H_ #include "aom_dsp/bitreader.h" @@ -32,4 +32,4 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, aom_reader *r); -#endif // AV1_DECODER_DECODEMV_H_ +#endif // AOM_AV1_DECODER_DECODEMV_H_ diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c index e978fad6c..a5f4fd67f 100644 --- a/third_party/aom/av1/decoder/decoder.c +++ b/third_party/aom/av1/decoder/decoder.c @@ -37,16 +37,11 @@ #include "av1/decoder/obu.h" static void initialize_dec(void) { - static volatile int init_done = 0; - - if (!init_done) { - av1_rtcd(); - aom_dsp_rtcd(); - aom_scale_rtcd(); - av1_init_intra_predictors(); - av1_init_wedge_masks(); - init_done = 1; - } + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_wedge_masks(); } static void dec_setup_mi(AV1_COMMON *cm) { @@ -171,8 +166,7 @@ void av1_decoder_remove(AV1Decoder *pbi) { if (pbi->thread_data) { for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; - const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0; - av1_free_mc_tmp_buf(thread_data->td, use_highbd); + av1_free_mc_tmp_buf(thread_data->td); aom_free(thread_data->td); } aom_free(pbi->thread_data); @@ -209,8 +203,7 @@ void av1_decoder_remove(AV1Decoder *pbi) { #if CONFIG_ACCOUNTING aom_accounting_clear(&pbi->accounting); #endif - const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 
1 : 0; - av1_free_mc_tmp_buf(&pbi->td, use_highbd); + av1_free_mc_tmp_buf(&pbi->td); aom_free(pbi); } diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h index 610b98d95..5ca939c24 100644 --- a/third_party/aom/av1/decoder/decoder.h +++ b/third_party/aom/av1/decoder/decoder.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_DECODER_DECODER_H_ -#define AV1_DECODER_DECODER_H_ +#ifndef AOM_AV1_DECODER_DECODER_H_ +#define AOM_AV1_DECODER_DECODER_H_ #include "config/aom_config.h" @@ -55,6 +55,11 @@ typedef struct ThreadData { CB_BUFFER cb_buffer_base; uint8_t *mc_buf[2]; int32_t mc_buf_size; + int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in + // mc_buf were converted from highbd pointers. + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; decode_block_visitor_fn_t predict_and_recon_intra_block_visit; @@ -199,6 +204,7 @@ typedef struct AV1Decoder { int tg_start; // First tile in the current tilegroup int tg_size_bit_offset; int sequence_header_ready; + int sequence_header_changed; #if CONFIG_INSPECTION aom_inspect_cb inspect_cb; void *inspect_ctx; @@ -308,4 +314,4 @@ typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, } // extern "C" #endif -#endif // AV1_DECODER_DECODER_H_ +#endif // AOM_AV1_DECODER_DECODER_H_ diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h index 687bba958..fe04f6abd 100644 --- a/third_party/aom/av1/decoder/decodetxb.h +++ b/third_party/aom/av1/decoder/decodetxb.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef DECODETXB_H_ -#define DECODETXB_H_ +#ifndef AOM_AV1_DECODER_DECODETXB_H_ +#define AOM_AV1_DECODER_DECODETXB_H_ #include "config/aom_config.h" @@ -29,4 +29,4 @@ void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size); -#endif // DECODETXB_H_ +#endif // AOM_AV1_DECODER_DECODETXB_H_ diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h index ec85bf7ea..173b437a9 100644 --- a/third_party/aom/av1/decoder/detokenize.h +++ b/third_party/aom/av1/decoder/detokenize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_DECODER_DETOKENIZE_H_ -#define AV1_DECODER_DETOKENIZE_H_ +#ifndef AOM_AV1_DECODER_DETOKENIZE_H_ +#define AOM_AV1_DECODER_DETOKENIZE_H_ #include "config/aom_config.h" @@ -26,4 +26,4 @@ void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_DECODER_DETOKENIZE_H_ +#endif // AOM_AV1_DECODER_DETOKENIZE_H_ diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h index 9f854e015..1d264b07e 100644 --- a/third_party/aom/av1/decoder/dthread.h +++ b/third_party/aom/av1/decoder/dthread.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_DECODER_DTHREAD_H_ -#define AV1_DECODER_DTHREAD_H_ +#ifndef AOM_AV1_DECODER_DTHREAD_H_ +#define AOM_AV1_DECODER_DTHREAD_H_ #include "config/aom_config.h" @@ -79,4 +79,4 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker, } // extern "C" #endif -#endif // AV1_DECODER_DTHREAD_H_ +#endif // AOM_AV1_DECODER_DTHREAD_H_ diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h index bb604f684..7214a9bed 100644 --- a/third_party/aom/av1/decoder/inspection.h +++ b/third_party/aom/av1/decoder/inspection.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_INSPECTION_H_ -#define AOM_INSPECTION_H_ +#ifndef AOM_AV1_DECODER_INSPECTION_H_ +#define AOM_AV1_DECODER_INSPECTION_H_ #ifdef __cplusplus extern "C" { @@ -81,4 +81,4 @@ int ifd_inspect(insp_frame_data *fd, void *decoder); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_INSPECTION_H_ +#endif // AOM_AV1_DECODER_INSPECTION_H_ diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c index 715bc6837..44ecf818e 100644 --- a/third_party/aom/av1/decoder/obu.c +++ b/third_party/aom/av1/decoder/obu.c @@ -18,6 +18,7 @@ #include "aom_ports/mem_ops.h" #include "av1/common/common.h" +#include "av1/common/obu_util.h" #include "av1/common/timing.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" @@ -42,85 +43,6 @@ typedef enum { SCALABILITY_SS = 14 } SCALABILITY_STRUCTURES; -// Returns 1 when OBU type is valid, and 0 otherwise. 
-static int valid_obu_type(int obu_type) { - int valid_type = 0; - switch (obu_type) { - case OBU_SEQUENCE_HEADER: - case OBU_TEMPORAL_DELIMITER: - case OBU_FRAME_HEADER: - case OBU_TILE_GROUP: - case OBU_METADATA: - case OBU_FRAME: - case OBU_REDUNDANT_FRAME_HEADER: - case OBU_TILE_LIST: - case OBU_PADDING: valid_type = 1; break; - default: break; - } - return valid_type; -} - -// Parses OBU header and stores values in 'header'. -static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, - int is_annexb, ObuHeader *header) { - if (!rb || !header) return AOM_CODEC_INVALID_PARAM; - - const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; - if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; - - header->size = 1; - - if (aom_rb_read_bit(rb) != 0) { - // Forbidden bit. Must not be set. - return AOM_CODEC_CORRUPT_FRAME; - } - - header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); - - if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME; - - header->has_extension = aom_rb_read_bit(rb); - header->has_size_field = aom_rb_read_bit(rb); - - if (!header->has_size_field && !is_annexb) { - // section 5 obu streams must have obu_size field set. - return AOM_CODEC_UNSUP_BITSTREAM; - } - - if (aom_rb_read_bit(rb) != 0) { - // obu_reserved_1bit must be set to 0. - return AOM_CODEC_CORRUPT_FRAME; - } - - if (header->has_extension) { - if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; - - header->size += 1; - header->temporal_layer_id = aom_rb_read_literal(rb, 3); - header->spatial_layer_id = aom_rb_read_literal(rb, 2); - if (aom_rb_read_literal(rb, 3) != 0) { - // extension_header_reserved_3bits must be set to 0. 
- return AOM_CODEC_CORRUPT_FRAME; - } - } - - return AOM_CODEC_OK; -} - -aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, - size_t *consumed, ObuHeader *header, - int is_annexb) { - if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; - - // TODO(tomfinegan): Set the error handler here and throughout this file, and - // confirm parsing work done via aom_read_bit_buffer is successful. - struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, - NULL }; - aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); - if (parse_result == AOM_CODEC_OK) *consumed = header->size; - return parse_result; -} - aom_codec_err_t aom_get_num_layers_from_operating_point_idc( int operating_point_idc, unsigned int *number_spatial_layers, unsigned int *number_temporal_layers) { @@ -208,7 +130,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, SequenceHeader *const seq_params = &sh; seq_params->profile = av1_read_profile(rb); - if (seq_params->profile > PROFILE_2) { + if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -349,10 +271,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // If a sequence header has been decoded before, we check if the new // one is consistent with the old one. 
if (pbi->sequence_header_ready) { - if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Inconsistent sequence headers received."); - } + if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) + pbi->sequence_header_changed = 1; } cm->seq_params = *seq_params; @@ -620,9 +540,9 @@ static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) { static void scalability_structure(struct aom_read_bit_buffer *rb) { int spatial_layers_cnt = aom_rb_read_literal(rb, 2); - int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1); - int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1); - int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1); + int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb); + int spatial_layer_description_present_flag = aom_rb_read_bit(rb); + int temporal_group_description_present_flag = aom_rb_read_bit(rb); aom_rb_read_literal(rb, 3); // reserved if (spatial_layer_dimensions_present_flag) { @@ -643,8 +563,8 @@ static void scalability_structure(struct aom_read_bit_buffer *rb) { temporal_group_size = aom_rb_read_literal(rb, 8); for (i = 0; i < temporal_group_size; i++) { aom_rb_read_literal(rb, 3); - aom_rb_read_literal(rb, 1); - aom_rb_read_literal(rb, 1); + aom_rb_read_bit(rb); + aom_rb_read_bit(rb); int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); for (j = 0; j < temporal_group_ref_cnt; j++) { aom_rb_read_literal(rb, 8); @@ -716,61 +636,6 @@ static size_t read_metadata(const uint8_t *data, size_t sz) { return sz; } -static aom_codec_err_t read_obu_size(const uint8_t *data, - size_t bytes_available, - size_t *const obu_size, - size_t *const length_field_size) { - uint64_t u_obu_size = 0; - if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != - 0) { - return AOM_CODEC_CORRUPT_FRAME; - } - - if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; - *obu_size = 
(size_t)u_obu_size; - return AOM_CODEC_OK; -} - -aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, - size_t bytes_available, - int is_annexb, - ObuHeader *obu_header, - size_t *const payload_size, - size_t *const bytes_read) { - size_t length_field_size = 0, obu_size = 0; - aom_codec_err_t status; - - if (is_annexb) { - // Size field comes before the OBU header, and includes the OBU header - status = - read_obu_size(data, bytes_available, &obu_size, &length_field_size); - - if (status != AOM_CODEC_OK) return status; - } - - struct aom_read_bit_buffer rb = { data + length_field_size, - data + bytes_available, 0, NULL, NULL }; - - status = read_obu_header(&rb, is_annexb, obu_header); - if (status != AOM_CODEC_OK) return status; - - if (is_annexb) { - // Derive the payload size from the data we've already read - if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; - - *payload_size = obu_size - obu_header->size; - } else { - // Size field comes after the OBU header, and is just the payload size - status = read_obu_size(data + obu_header->size, - bytes_available - obu_header->size, payload_size, - &length_field_size); - if (status != AOM_CODEC_OK) return status; - } - - *bytes_read = length_field_size + obu_header->size; - return AOM_CODEC_OK; -} - // On success, returns a boolean that indicates whether the decoding of the // current frame is finished. On failure, sets cm->error.error_code and // returns -1. 
@@ -781,8 +646,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, int frame_decoding_finished = 0; int is_first_tg_obu_received = 1; uint32_t frame_header_size = 0; - int seq_header_received = 0; - size_t seq_header_size = 0; ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); pbi->seen_frame_header = 0; @@ -853,19 +716,8 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, pbi->seen_frame_header = 0; break; case OBU_SEQUENCE_HEADER: - if (!seq_header_received) { - decoded_payload_size = read_sequence_header_obu(pbi, &rb); - if (cm->error.error_code != AOM_CODEC_OK) return -1; - - seq_header_size = decoded_payload_size; - seq_header_received = 1; - } else { - // Seeing another sequence header, skip as all sequence headers are - // required to be identical except for the contents of - // operating_parameters_info and the amount of trailing bits. - // TODO(yaowu): verifying redundant sequence headers are identical. - decoded_payload_size = seq_header_size; - } + decoded_payload_size = read_sequence_header_obu(pbi, &rb); + if (cm->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_FRAME_HEADER: case OBU_REDUNDANT_FRAME_HEADER: @@ -889,6 +741,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, assert(rb.bit_offset == 0); rb.bit_offset = 8 * frame_header_size; } + decoded_payload_size = frame_header_size; pbi->frame_header_size = frame_header_size; @@ -938,6 +791,11 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, decoded_payload_size = read_metadata(data, payload_size); break; case OBU_TILE_LIST: + if (CONFIG_NORMAL_TILE_MODE) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + // This OBU type is purely for the large scale tile coding mode. // The common camera frame header has to be already decoded. 
if (!pbi->camera_frame_header_ready) { diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h index 5f2197058..5ab243fc9 100644 --- a/third_party/aom/av1/decoder/obu.h +++ b/third_party/aom/av1/decoder/obu.h @@ -9,35 +9,12 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_DECODER_OBU_H -#define AV1_DECODER_OBU_H +#ifndef AOM_AV1_DECODER_OBU_H_ +#define AOM_AV1_DECODER_OBU_H_ #include "aom/aom_codec.h" #include "av1/decoder/decoder.h" -typedef struct { - size_t size; // Size (1 or 2 bytes) of the OBU header (including the - // optional OBU extension header) in the bitstream. - OBU_TYPE type; - int has_size_field; - int has_extension; - // The following fields come from the OBU extension header and therefore are - // only used if has_extension is true. - int temporal_layer_id; - int spatial_layer_id; -} ObuHeader; - -aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, - size_t *consumed, ObuHeader *header, - int is_annexb); - -aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, - size_t bytes_available, - int is_annexb, - ObuHeader *obu_header, - size_t *const payload_size, - size_t *const bytes_read); - // Try to decode one frame from a buffer. 
// Returns 1 if we decoded a frame, // 0 if we didn't decode a frame but that's okay @@ -51,4 +28,4 @@ aom_codec_err_t aom_get_num_layers_from_operating_point_idc( int operating_point_idc, unsigned int *num_spatial_layers, unsigned int *num_temporal_layers); -#endif +#endif // AOM_AV1_DECODER_OBU_H_ diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c index b721b6d2b..80f8e2e66 100644 --- a/third_party/aom/av1/encoder/aq_complexity.c +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -143,9 +143,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth); aom_clear_system_state(); - low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy, - MIN_DEFAULT_LV_THRESH) - : DEFAULT_LV_THRESH; + low_var_thresh = + (cpi->oxcf.pass == 2) + ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes); logvar = av1_log_block_var(cpi, mb, bs); diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h index af525b36d..3421d74c9 100644 --- a/third_party/aom/av1/encoder/aq_complexity.h +++ b/third_party/aom/av1/encoder/aq_complexity.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_ -#define AV1_ENCODER_AQ_COMPLEXITY_H_ +#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -34,4 +34,4 @@ void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_AQ_COMPLEXITY_H_ +#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index dec2c730d..f532d48da 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -80,9 +80,11 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - aom_free(cr->map); - aom_free(cr->last_coded_q_map); - aom_free(cr); + if (cr != NULL) { + aom_free(cr->map); + aom_free(cr->last_coded_q_map); + aom_free(cr); + } } // Check if we should turn off cyclic refresh based on bitrate condition. diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h index 459ab80b8..b45781983 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.h +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_ -#define AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ #include "av1/common/blockd.h" @@ -95,4 +95,4 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { } // extern "C" #endif -#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c index 6cb6adc42..58f906bdc 100644 --- a/third_party/aom/av1/encoder/aq_variance.c +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -14,34 +14,33 @@ #include "aom_ports/mem.h" #include "av1/encoder/aq_variance.h" - #include "av1/common/seg_common.h" +#include "av1/encoder/encodeframe.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/dwt.h" #include "aom_ports/system_state.h" +static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0, + 0.9, .8, .7, .6 }; + +static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, + 0.75, 1.0, 1.0, 1.0 }; #define ENERGY_MIN (-4) #define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy) \ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) -static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, - 0.75, 1.0, 1.0, 1.0 }; -static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; - -#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] - DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; + DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; -unsigned int av1_vaq_segment_id(int energy) { - ENERGY_IN_BOUNDS(energy); - return SEGMENT_ID(energy); -} +static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; + +#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] void av1_vaq_frame_setup(AV1_COMP *cpi) { AV1_COMMON 
*cm = &cpi->common; @@ -51,6 +50,12 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); + int avg_energy = (int)(cpi->twopass.mb_av_energy - 2); + double avg_ratio; + if (avg_energy > 7) avg_energy = 7; + if (avg_energy < 0) avg_energy = 0; + avg_ratio = rate_ratio[avg_energy]; + if (resolution_change) { memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); av1_clearall_segfeatures(seg); @@ -69,9 +74,11 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { aom_clear_system_state(); for (i = 0; i < MAX_SEGMENTS; ++i) { - int qindex_delta = - av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - rate_ratio[i], cm->seq_params.bit_depth); + // Set up avg segment id to be 1.0 and adjust the other segments around + // it. + int qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio, + cm->seq_params.bit_depth); // We don't allow qindex 0 in a segment if the base value is not 0. // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment @@ -87,114 +94,58 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { } } -/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions - * of variance() and highbd_8_variance(). It should not. - */ -static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { - int i, j; - - *sum = 0; - *sse = 0; +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + // This functions returns a score for the blocks local variance as calculated + // by: sum of the log of the (4x4 variances) of each subblock to the current + // block (x,bs) + // * 32 / number of pixels in the block_size. 
+ // This is used for segmentation because to avoid situations in which a large + // block with a gentle gradient gets marked high variance even though each + // subblock has a low variance. This allows us to assign the same segment + // number for the same sorts of area regardless of how the partitioning goes. - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - -static void aq_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, uint64_t *sum) { + MACROBLOCKD *xd = &x->e_mbd; + double var = 0; + unsigned int sse; int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - a += a_stride; - b += b_stride; - } -} - -static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bs) { - MACROBLOCKD *xd = &x->e_mbd; - unsigned int var, sse; int right_overflow = (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? 
((-xd->mb_to_bottom_edge) >> 3) : 0; - if (right_overflow || bottom_overflow) { - const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; - const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; - int avg; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh, - &sse, &avg); - sse >>= 2 * (xd->bd - 8); - avg >>= (xd->bd - 8); - } else { - aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, - bw, bh, &sse, &avg); - } - var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); - return (unsigned int)((uint64_t)var * 256) / (bw * bh); - } else { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - var = - cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse); - } else { - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, - av1_all_zeros, 0, &sse); + const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; + const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; + + aom_clear_system_state(); + + for (i = 0; i < bh; i += 4) { + for (j = 0; j < bw; j += 4) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var += + log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / + 16); + } else { + var += + log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, av1_all_zeros, 0, &sse) / + 16); + } } - return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } -} + // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561. 
+ var /= (bw / 4 * bh / 4); + if (var > 7) var = 7; -double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { - unsigned int var = block_variance(cpi, x, bs); aom_clear_system_state(); - return log(var + 1.0); + return (int)(var); } #define DEFAULT_E_MIDPOINT 10.0 -int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { - double energy; - double energy_midpoint; - aom_clear_system_state(); - energy_midpoint = - (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT; - energy = av1_log_block_var(cpi, x, bs) - energy_midpoint; - return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); -} unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { MACROBLOCKD *xd = &x->e_mbd; @@ -231,17 +182,21 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, int block_var_level) { - ENERGY_IN_BOUNDS(block_var_level); - - const int rate_level = SEGMENT_ID(block_var_level); + int rate_level; const AV1_COMMON *const cm = &cpi->common; + + if (DELTAQ_MODULATION == 1) { + ENERGY_IN_BOUNDS(block_var_level); + rate_level = SEGMENT_ID(block_var_level); + } else { + rate_level = block_var_level; + } int qindex_delta = av1_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level], + &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth); if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { qindex_delta = -cm->base_qindex + 1; } - return qindex_delta; } diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h index b1a8bc38a..2d22b663e 100644 --- a/third_party/aom/av1/encoder/aq_variance.h +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_AQ_VARIANCE_H_ -#define AV1_ENCODER_AQ_VARIANCE_H_ +#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_ +#define AOM_AV1_ENCODER_AQ_VARIANCE_H_ #include "av1/encoder/encoder.h" @@ -18,11 +18,9 @@ extern "C" { #endif -unsigned int av1_vaq_segment_id(int energy); void av1_vaq_frame_setup(AV1_COMP *cpi); -int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); -double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi, int block_var_level); int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, @@ -32,4 +30,4 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, } // extern "C" #endif -#endif // AV1_ENCODER_AQ_VARIANCE_H_ +#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c index b92b3469f..98505e0b1 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c @@ -11,24 +11,7 @@ #include #include "av1/encoder/av1_fwd_txfm1d.h" - -#if CONFIG_COEFFICIENT_RANGE_CHECKING -void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf, - int32_t size, int8_t bit); - -#define range_check(stage, input, buf, size, bit) \ - range_check_func(stage, input, buf, size, bit) -#else // CONFIG_COEFFICIENT_RANGE_CHECKING - -#define range_check(stage, input, buf, size, bit) \ - { \ - (void)stage; \ - (void)input; \ - (void)buf; \ - (void)size; \ - (void)bit; \ - } -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#include "av1/common/av1_txfm.h" void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { @@ -40,7 +23,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[4]; // stage 0; - range_check(stage, input, input, size, 
stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -49,7 +32,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = input[1] + input[2]; bf1[2] = -input[2] + input[1]; bf1[3] = -input[3] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -60,7 +43,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -70,7 +53,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = bf0[2]; bf1[2] = bf0[1]; bf1[3] = bf0[3]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -83,7 +66,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -96,7 +79,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = -input[5] + input[2]; bf1[6] = -input[6] + input[1]; bf1[7] = -input[7] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -111,7 +94,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); 
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -126,7 +109,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -141,7 +124,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -155,7 +138,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = bf0[3]; bf1[7] = bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -168,7 +151,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -189,7 +172,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -input[13] + input[2]; bf1[14] = -input[14] + input[1]; bf1[15] = -input[15] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -212,7 +195,7 @@ void 
av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -235,7 +218,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -bf0[13] + bf0[14]; bf1[14] = bf0[14] + bf0[13]; bf1[15] = bf0[15] + bf0[12]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -258,7 +241,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); bf1[15] = bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -281,7 +264,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -bf0[13] + bf0[12]; bf1[14] = -bf0[14] + bf0[15]; bf1[15] = bf0[15] + bf0[14]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -304,7 +287,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -326,7 +309,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[11]; bf1[14] = bf0[7]; bf1[15] = bf0[15]; - 
range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -339,7 +322,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[32]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -376,7 +359,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -input[29] + input[2]; bf1[30] = -input[30] + input[1]; bf1[31] = -input[31] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -415,7 +398,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -454,7 +437,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29] + bf0[26]; bf1[30] = bf0[30] + bf0[25]; bf1[31] = bf0[31] + bf0[24]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -493,7 +476,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -532,7 +515,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -bf0[29] + bf0[30]; bf1[30] = bf0[30] + bf0[29]; bf1[31] = bf0[31] + bf0[28]; - 
range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -571,7 +554,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -610,7 +593,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = -bf0[29] + bf0[28]; bf1[30] = -bf0[30] + bf0[31]; bf1[31] = bf0[31] + bf0[30]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -649,7 +632,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -687,7 +670,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[23]; bf1[30] = bf0[15]; bf1[31] = bf0[31]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -698,7 +681,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t s0, s1, s2, s3, s4, s5, s6, s7; // stage 0 - range_check(0, input, input, 4, stage_range[0]); + av1_range_check_buf(0, input, input, 4, stage_range[0]); x0 = input[0]; x1 = input[1]; x2 = input[2]; @@ -746,7 
+729,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[1] = round_shift(s1, bit); output[2] = round_shift(s2, bit); output[3] = round_shift(s3, bit); - range_check(6, input, output, 4, stage_range[6]); + av1_range_check_buf(6, input, output, 4, stage_range[6]); } void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -759,7 +742,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[8]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -773,7 +756,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[6]; bf1[6] = input[2]; bf1[7] = -input[5]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -788,7 +771,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -802,7 +785,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5] + bf0[7]; bf1[6] = bf0[4] - bf0[6]; bf1[7] = bf0[5] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -817,7 +800,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); - range_check(stage, input, bf1, 
size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -831,7 +814,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[1] - bf0[5]; bf1[6] = bf0[2] - bf0[6]; bf1[7] = bf0[3] - bf0[7]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -846,7 +829,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -860,7 +843,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[2]; bf1[6] = bf0[7]; bf1[7] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -873,7 +856,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[16]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -895,7 +878,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = -input[13]; bf1[14] = -input[5]; bf1[15] = input[10]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -918,7 +901,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], 
cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -940,7 +923,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13] + bf0[15]; bf1[14] = bf0[12] - bf0[14]; bf1[15] = bf0[13] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -963,7 +946,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -985,7 +968,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[9] - bf0[13]; bf1[14] = bf0[10] - bf0[14]; bf1[15] = bf0[11] - bf0[15]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1008,7 +991,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1030,7 +1013,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[5] - bf0[13]; bf1[14] = bf0[6] - bf0[14]; bf1[15] = bf0[7] - bf0[15]; - range_check(stage, 
input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1053,7 +1036,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1075,7 +1058,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[2]; bf1[14] = bf0[15]; bf1[15] = bf0[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1084,14 +1067,14 @@ void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 4; ++i) output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); - range_check(0, input, output, 4, stage_range[0]); + av1_range_check_buf(0, input, output, 4, stage_range[0]); } void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; - range_check(0, input, output, 8, stage_range[0]); + av1_range_check_buf(0, input, output, 8, stage_range[0]); } void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1100,14 +1083,14 @@ void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, for (int i = 0; i < 16; ++i) output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); - range_check(0, input, output, 16, stage_range[0]); + av1_range_check_buf(0, input, 
output, 16, stage_range[0]); } void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; - range_check(0, input, output, 32, stage_range[0]); + av1_range_check_buf(0, input, output, 32, stage_range[0]); } void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -1120,7 +1103,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, int32_t step[64]; // stage 0; - range_check(stage, input, input, size, stage_range[stage]); + av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; @@ -1189,7 +1172,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -input[61] + input[2]; bf1[62] = -input[62] + input[1]; bf1[63] = -input[63] + input[0]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -1260,7 +1243,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -1331,7 +1314,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61] + bf0[50]; bf1[62] = bf0[62] + bf0[49]; bf1[63] = bf0[63] + bf0[48]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -1402,7 +1385,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -1473,7 +1456,7 @@ void 
av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61] + bf0[58]; bf1[62] = bf0[62] + bf0[57]; bf1[63] = bf0[63] + bf0[56]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1544,7 +1527,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1615,7 +1598,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -bf0[61] + bf0[62]; bf1[62] = bf0[62] + bf0[61]; bf1[63] = bf0[63] + bf0[60]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1686,7 +1669,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1757,7 +1740,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = -bf0[61] + bf0[60]; bf1[62] = -bf0[62] + bf0[63]; bf1[63] = bf0[63] + bf0[62]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; @@ -1828,7 +1811,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); bf1[63] = half_btf(cospi[63], 
bf0[63], -cospi[1], bf0[32], cos_bit); - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; @@ -1898,5 +1881,5 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[47]; bf1[62] = bf0[31]; bf1[63] = bf0[63]; - range_check(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h index 9472af8e6..9dcf16552 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_FWD_TXFM1D_H_ -#define AV1_FWD_TXFM1D_H_ +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ #include "av1/common/av1_txfm.h" @@ -46,4 +46,4 @@ void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, } #endif -#endif // AV1_FWD_TXFM1D_H_ +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h index 174689a14..98b6530db 100644 --- a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -9,11 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_FWD_TXFM2D_CFG_H_ -#define AV1_FWD_TXFM2D_CFG_H_ +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ #include "av1/common/enums.h" #include "av1/encoder/av1_fwd_txfm1d.h" extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL]; extern const int8_t fwd_cos_bit_col[5][5]; extern const int8_t fwd_cos_bit_row[5][5]; -#endif // AV1_FWD_TXFM2D_CFG_H_ +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index d0477b35b..a0a926005 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -273,35 +273,32 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan, qm_ptr, iqm_ptr, - qparam->log_scale); + quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: - aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; case 1: - 
aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; case 2: - aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan); + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); break; default: assert(0); } @@ -392,28 +389,25 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { - // obsolete skip_block - const int skip_block = 0; const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, - sc->scan, sc->iscan, qm_ptr, iqm_ptr, - qparam->log_scale); + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: if (LIKELY(n_coeffs >= 8)) { - aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, - p->round_QTX, p->quant_QTX, p->quant_shift_QTX, - qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, - eob_ptr, sc->scan, sc->iscan); + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, 
p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); } else { // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size // quantization - aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, + aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); @@ -421,15 +415,15 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, break; case 1: aom_highbd_quantize_b_32x32( - coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, - p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); break; case 2: aom_highbd_quantize_b_64x64( - coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, - p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index eaf8374de..35af9a67a 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_QUANTIZE_H_ -#define AV1_ENCODER_QUANTIZE_H_ +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ #include "config/aom_config.h" @@ -145,4 +145,4 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, } // extern "C" #endif -#endif // AV1_ENCODER_QUANTIZE_H_ +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index 2070755cd..2c4acdb02 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -18,6 +18,7 @@ #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/bitwriter_buffer.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" #include "aom_ports/mem_ops.h" #include "aom_ports/system_state.h" #if CONFIG_BITSTREAM_DEBUG @@ -30,7 +31,6 @@ #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/mvref_common.h" -#include "av1/common/odintrin.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" @@ -66,11 +66,11 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, aom_writer *const w, int plane, FRAME_COUNTS *counts); -static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx, - const MB_MODE_INFO *mi, - const MB_MODE_INFO *above_mi, - const MB_MODE_INFO *left_mi, - PREDICTION_MODE mode, aom_writer *w) { +static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx, + const MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + PREDICTION_MODE mode, aom_writer *w) { assert(!is_intrabc_block(mi)); (void)mi; aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), @@ -297,7 +297,7 @@ static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex, DELTA_Q_PROBS + 1); if (!smallval) { - rem_bits = OD_ILOG_NZ(abs - 1) - 1; + rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits 
- 1, 3); aom_write_literal(w, abs - thr, rem_bits); @@ -326,7 +326,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, } if (!smallval) { - rem_bits = OD_ILOG_NZ(abs - 1) - 1; + rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits - 1, 3); aom_write_literal(w, abs - thr, rem_bits); @@ -836,8 +836,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, } } -static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, - PREDICTION_MODE mode, aom_writer *w) { +static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, + PREDICTION_MODE mode, aom_writer *w) { aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], INTRA_MODES); } @@ -933,45 +933,24 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, } } -static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, - const int mi_col, aom_writer *w) { +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. 
+static void write_delta_q_params(AV1_COMP *cpi, const int mi_row, + const int mi_col, int skip, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - const struct segmentation *const seg = &cm->seg; - struct segmentation_probs *const segp = &ec_ctx->seg; - const MB_MODE_INFO *const mbmi = xd->mi[0]; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const PREDICTION_MODE mode = mbmi->mode; - const int segment_id = mbmi->segment_id; - const BLOCK_SIZE bsize = mbmi->sb_type; - const int allow_hp = cm->allow_high_precision_mv; - const int is_inter = is_inter_block(mbmi); - const int is_compound = has_second_ref(mbmi); - int skip, ref; - (void)mi_row; - (void)mi_col; - - write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); - - write_skip_mode(cm, xd, segment_id, mbmi, w); - - assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); - skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); - - write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); - - write_cdef(cm, xd, w, skip, mi_col, mi_row); - if (cm->delta_q_present_flag) { - int super_block_upper_left = + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int super_block_upper_left = ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + if ((bsize != cm->seq_params.sb_size || skip == 0) && super_block_upper_left) { assert(mbmi->current_qindex > 0); - int reduced_delta_qindex = + const int reduced_delta_qindex = (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; write_delta_qindex(xd, reduced_delta_qindex, w); xd->current_qindex = mbmi->current_qindex; @@ -996,37 +975,96 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } } } +} - if (!mbmi->skip_mode) write_is_inter(cm, xd, 
mbmi->segment_id, w, is_inter); +static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row, + const int mi_col, int is_keyframe, + aom_writer *w) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; - if (mbmi->skip_mode) return; + // Y mode. + if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } - if (!is_inter) { - write_intra_mode(ec_ctx, bsize, mode, w); - const int use_angle_delta = av1_use_angle_delta(bsize); + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } - if (use_angle_delta && av1_is_directional_mode(mode)) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], - ec_ctx->angle_delta_cdf[mode - V_PRED]); + // UV mode and UV angle delta. 
+ if (!cm->seq_params.monochrome && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); } + } - if (!cm->seq_params.monochrome && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); - if (uv_mode == UV_CFL_PRED) - write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); - if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], - ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); - } - } + // Palette. + if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + } - if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); + // Filter intra. 
+ write_filter_intra_mode_info(cm, xd, mbmi, w); +} - write_filter_intra_mode_info(cm, xd, mbmi, w); +static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, + const int mi_col, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + const int skip = + mbmi->skip_mode ? 
1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); + + write_cdef(cm, xd, w, skip, mi_col, mi_row); + + write_delta_q_params(cpi, mi_row, mi_col, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w); } else { int16_t mode_ctx; @@ -1172,11 +1210,7 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; - const MB_MODE_INFO *const above_mi = xd->above_mbmi; - const MB_MODE_INFO *const left_mi = xd->left_mbmi; const MB_MODE_INFO *const mbmi = xd->mi[0]; - const BLOCK_SIZE bsize = mbmi->sb_type; - const PREDICTION_MODE mode = mbmi->mode; if (seg->segid_preskip && seg->update_map) write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); @@ -1188,69 +1222,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, write_cdef(cm, xd, w, skip, mi_col, mi_row); - if (cm->delta_q_present_flag) { - int super_block_upper_left = - ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((mi_col & (cm->seq_params.mib_size - 1)) == 0); - if ((bsize != cm->seq_params.sb_size || skip == 0) && - super_block_upper_left) { - assert(mbmi->current_qindex > 0); - int reduced_delta_qindex = - (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res; - write_delta_qindex(xd, reduced_delta_qindex, w); - xd->current_qindex = mbmi->current_qindex; - if (cm->delta_lf_present_flag) { - if (cm->delta_lf_multi) { - const int frame_lf_count = - av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { - int reduced_delta_lflevel = - (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); - xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; - } - } else { - int reduced_delta_lflevel = - (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / - cm->delta_lf_res; - write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); - xd->delta_lf_from_base = mbmi->delta_lf_from_base; - } - } - } - } + write_delta_q_params(cpi, mi_row, mi_col, skip, w); if (av1_allow_intrabc(cm)) { write_intrabc_info(xd, mbmi_ext, w); if (is_intrabc_block(mbmi)) return; } - write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); - - const int use_angle_delta = av1_use_angle_delta(bsize); - if (use_angle_delta && av1_is_directional_mode(mode)) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], - ec_ctx->angle_delta_cdf[mode - V_PRED]); - } - - if (!cm->seq_params.monochrome && - is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) { - const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; - write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); - if (uv_mode == UV_CFL_PRED) - write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); - if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { - write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], - ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); - } - } - - if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) - write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w); - - write_filter_intra_mode_info(cm, xd, mbmi, w); + write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w); } #if CONFIG_RD_DEBUG @@ -1549,10 +1528,10 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); } else { 
write_selected_tx_size(xd, w); - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd); + set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd); } } else { - set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, skip && is_inter_block(mbmi), xd); } @@ -1694,15 +1673,14 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, } static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, - aom_writer *const w, const TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end) { + aom_writer *const w, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int mi_row_start = tile->mi_row_start; const int mi_row_end = tile->mi_row_end; const int mi_col_start = tile->mi_col_start; const int mi_col_end = tile->mi_col_end; - int mi_row, mi_col; + int mi_row, mi_col, sb_row_in_tile; av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); av1_init_above_context(cm, xd, tile->tile_row); @@ -1716,13 +1694,21 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->seq_params.mib_size) { + sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + const TOKENEXTRA *tok = + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start; + const TOKENEXTRA *tok_end = + tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count; + av1_zero_left_context(xd); for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->seq_params.mib_size) { - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, + write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, cm->seq_params.sb_size); } + assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop); } } @@ -2220,33 +2206,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm, } } -#if USE_GF16_MULTI_LAYER -static int get_refresh_mask_gf16(AV1_COMP *cpi) { - if 
(cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common)) - return 0xFF; - - int refresh_mask = 0; - - if (cpi->refresh_last_frame || cpi->refresh_golden_frame || - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_alt_ref_frame) { - assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES); - refresh_mask |= (1 << cpi->refresh_fb_idx); - } - - return refresh_mask; -} -#endif // USE_GF16_MULTI_LAYER - static int get_refresh_mask(AV1_COMP *cpi) { if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) || frame_is_sframe(&cpi->common)) return 0xFF; int refresh_mask = 0; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi); -#endif // USE_GF16_MULTI_LAYER // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be // notified to get LAST3_FRAME refreshed and then the virtual indexes for all @@ -2281,8 +2246,13 @@ static int get_refresh_mask(AV1_COMP *cpi) { // Note: This is highly specific to the use of ARF as a forward reference, // and this needs to be generalized as other uses are implemented // (like RTC/temporal scalability). 
- return refresh_mask | - (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); + + if (cpi->preserve_arf_as_gld) { + return refresh_mask; + } else { + return refresh_mask | + (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]); + } } else { const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; return refresh_mask | @@ -2574,9 +2544,9 @@ static void write_film_grain_params(AV1_COMP *cpi, aom_wb_write_literal(wb, pars->random_seed, 16); - pars->random_seed += 3245; // For film grain test vectors purposes + pars->random_seed += 3381; // Changing random seed for film grain if (!pars->random_seed) // Random seed should not be zero - pars->random_seed += 1735; + pars->random_seed += 7391; if (cm->frame_type == INTER_FRAME) aom_wb_write_bit(wb, pars->update_parameters); else @@ -2685,7 +2655,8 @@ static void write_sb_size(SequenceHeader *seq_params, aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } -void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { +static void write_sequence_header(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *seq_params = &cm->seq_params; @@ -2695,8 +2666,10 @@ void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { int max_frame_height = cpi->oxcf.forced_max_frame_height ? cpi->oxcf.forced_max_frame_height : cpi->oxcf.height; + // max((int)ceil(log2(max_frame_width)), 1) const int num_bits_width = (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1; + // max((int)ceil(log2(max_frame_height)), 1) const int num_bits_height = (max_frame_height > 1) ? 
get_msb(max_frame_height - 1) + 1 : 1; assert(num_bits_width <= 16); @@ -2954,7 +2927,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, assert(cm->frame_type == KEY_FRAME); } if (!seq_params->reduced_still_picture_hdr) { - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; @@ -3254,14 +3227,14 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (cm->base_qindex > 0) { aom_wb_write_bit(wb, cm->delta_q_present_flag); if (cm->delta_q_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); + aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2); xd->current_qindex = cm->base_qindex; if (cm->allow_intrabc) assert(cm->delta_lf_present_flag == 0); else aom_wb_write_bit(wb, cm->delta_lf_present_flag); if (cm->delta_lf_present_flag) { - aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); + aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2); aom_wb_write_bit(wb, cm->delta_lf_multi); av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } @@ -3508,7 +3481,7 @@ static void write_bitstream_level(BitstreamLevel bl, aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } -static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { +uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { AV1_COMMON *const cm = &cpi->common; struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; @@ -3619,7 +3592,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, AV1_COMMON *const cm = &cpi->common; aom_writer mode_bc; int tile_row, tile_col; - TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; uint32_t total_size = 0; const int tile_cols = cm->tile_cols; @@ -3684,8 +3656,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP 
*const cpi, uint8_t *const dst, for (tile_row = 0; tile_row < tile_rows; tile_row++) { TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; const int data_offset = have_tiles ? 4 : 0; const int tile_idx = tile_row * tile_cols + tile_col; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; @@ -3703,8 +3673,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); - assert(tok == tok_end); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; buf->size = tile_size; @@ -3794,8 +3763,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int tile_idx = tile_row * tile_cols + tile_col; TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; int is_last_tile_in_tg = 0; if (new_tg) { @@ -3847,7 +3814,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); aom_start_encode(&mode_bc, dst + total_size); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); @@ -3990,7 +3957,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { data += obu_header_size + obu_payload_size + length_field_size; } - const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame); + const 
int write_frame_header = + (cm->num_tg > 1 || encode_show_existing_frame(cm)); struct aom_write_bit_buffer saved_wb; if (write_frame_header) { // Write Frame Header OBU. @@ -4017,7 +3985,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { saved_wb.bit_buffer += length_field_size; } - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { data_size = 0; } else { // Each tile group obu will be preceded by 4-byte size of the tile group diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h index 2047b6833..465ccaed5 100644 --- a/third_party/aom/av1/encoder/bitstream.h +++ b/third_party/aom/av1/encoder/bitstream.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_BITSTREAM_H_ -#define AV1_ENCODER_BITSTREAM_H_ +#ifndef AOM_AV1_ENCODER_BITSTREAM_H_ +#define AOM_AV1_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -20,8 +20,13 @@ extern "C" { struct aom_write_bit_buffer; -void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb); +// Writes only the OBU Sequence Header payload, and returns the size of the +// payload written to 'dst'. This function does not write the OBU header, the +// optional extension, or the OBU size to 'dst'. +uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst); +// Writes the OBU header byte, and the OBU header extension byte when +// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. 
uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, uint8_t *const dst); @@ -32,8 +37,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { // Do not swap gf and arf indices for internal overlay frames - return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref && - !cpi->rc.is_src_frame_ext_arf; + return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf; } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, @@ -44,4 +48,4 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, } // extern "C" #endif -#endif // AV1_ENCODER_BITSTREAM_H_ +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 003e59e39..0bc5dea82 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_BLOCK_H_ -#define AV1_ENCODER_BLOCK_H_ +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ #include "av1/common/entropymv.h" #include "av1/common/entropy.h" @@ -170,6 +170,7 @@ typedef struct { InterpFilters filters; int_mv mv[2]; int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; } INTERPOLATION_FILTER_STATS; typedef struct macroblock MACROBLOCK; @@ -254,6 +255,19 @@ struct macroblock { PALETTE_BUFFER *palette_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + // buffer for hash value calculation of a block + // used only in av1_get_block_hash_value() + // [first hash/second hash] + // [two buffers used ping-pong] + uint32_t *hash_value_buffer[2][2]; + + CRC_CALCULATOR crc_calculator1; + CRC_CALCULATOR crc_calculator2; + int g_crc_initialized; + // These define limits to motion vector components to prevent them // from extending outside the UMV borders MvLimits mv_limits; @@ -344,7 +358,6 @@ struct macroblock { #if CONFIG_DIST_8X8 int using_dist_8x8; aom_tune_metric tune_metric; - DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]); #endif // CONFIG_DIST_8X8 int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; @@ -352,6 +365,8 @@ struct macroblock { int tx_search_prune[EXT_TX_SET_TYPES]; int must_find_valid_partition; int tx_split_prune_flag; // Flag to skip tx split RD search. 
+ int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during + // interpolation filter search }; static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { @@ -400,8 +415,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { return depth; } +static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx, + int skip) { + if (skip) + x->blk_skip[blk_idx] |= 1UL << plane; + else + x->blk_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + x->blk_skip[blk_idx] |= 1UL << (1 + 4); + x->blk_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + x->blk_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((x->blk_skip[blk_idx] & 0x88) == 0); +#endif + return (x->blk_skip[blk_idx] >> plane) & 1; +} + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_BLOCK_H_ +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c index 66dedd9ed..f7cff9e53 100644 --- a/third_party/aom/av1/encoder/blockiness.c +++ b/third_party/aom/av1/encoder/blockiness.c @@ -16,7 +16,6 @@ #include "av1/common/common.h" #include "av1/common/filter.h" #include "aom/aom_integer.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c index d6e556b93..57f59f304 100644 --- a/third_party/aom/av1/encoder/context_tree.c +++ b/third_party/aom/av1/encoder/context_tree.c @@ -175,14 +175,15 @@ 
void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { } void av1_free_pc_tree(ThreadData *td, const int num_planes) { - const int tree_nodes_inc = 1024; - - const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; - int i; - for (i = 0; i < tree_nodes; ++i) - free_tree_contexts(&td->pc_tree[i], num_planes); - aom_free(td->pc_tree); - td->pc_tree = NULL; + if (td->pc_tree != NULL) { + const int tree_nodes_inc = 1024; + const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; + for (int i = 0; i < tree_nodes; ++i) { + free_tree_contexts(&td->pc_tree[i], num_planes); + } + aom_free(td->pc_tree); + td->pc_tree = NULL; + } } void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index c05f48a7a..4efc34985 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CONTEXT_TREE_H_ -#define AV1_ENCODER_CONTEXT_TREE_H_ +#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ +#define AOM_AV1_ENCODER_CONTEXT_TREE_H_ #include "av1/common/blockd.h" #include "av1/encoder/block.h" @@ -56,6 +56,8 @@ typedef struct { int hybrid_pred_diff; int comp_pred_diff; int single_pred_diff; + // Skip certain ref frames during RD search of rectangular partitions. + int skip_ref_frame_mask; // TODO(jingning) Use RD_COST struct here instead. This involves a boarder // scope of refactoring. 
@@ -109,4 +111,4 @@ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, } // extern "C" #endif -#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */ +#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h index 0317db5b3..cab59a774 100644 --- a/third_party/aom/av1/encoder/corner_detect.h +++ b/third_party/aom/av1/encoder/corner_detect.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CORNER_DETECT_H_ -#define AV1_ENCODER_CORNER_DETECT_H_ +#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_ +#define AOM_AV1_ENCODER_CORNER_DETECT_H_ #include #include @@ -19,4 +19,4 @@ int fast_corner_detect(unsigned char *buf, int width, int height, int stride, int *points, int max_points); -#endif // AV1_ENCODER_CORNER_DETECT_H_ +#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_ diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h index 3b16f9efc..535d2faed 100644 --- a/third_party/aom/av1/encoder/corner_match.h +++ b/third_party/aom/av1/encoder/corner_match.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_CORNER_MATCH_H_ -#define AV1_ENCODER_CORNER_MATCH_H_ +#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_ +#define AOM_AV1_ENCODER_CORNER_MATCH_H_ #include #include @@ -30,4 +30,4 @@ int determine_correspondence(unsigned char *frm, int *frm_corners, int height, int frm_stride, int ref_stride, int *correspondence_pts); -#endif // AV1_ENCODER_CORNER_MATCH_H_ +#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_ diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h index 5de7765c5..af5b09837 100644 --- a/third_party/aom/av1/encoder/cost.h +++ b/third_party/aom/av1/encoder/cost.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_COST_H_ -#define AV1_ENCODER_COST_H_ +#ifndef AOM_AV1_ENCODER_COST_H_ +#define AOM_AV1_ENCODER_COST_H_ #include "aom_dsp/prob.h" #include "aom/aom_integer.h" @@ -44,4 +44,4 @@ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AV1_ENCODER_COST_H_ +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h index 03318e5b7..37306c6a5 100644 --- a/third_party/aom/av1/encoder/dwt.h +++ b/third_party/aom/av1/encoder/dwt.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + #include "av1/common/common.h" #include "av1/common/enums.h" @@ -18,3 +21,5 @@ void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, int hbd); int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index 27ca53761..cb226c59e 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -40,11 +40,11 @@ #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" -#include "av1/common/warped_motion.h" #include "av1/encoder/global_motion.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" @@ -56,6 +56,7 @@ #include "av1/encoder/partition_model_weights.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" @@ -348,8 +349,9 @@ static void 
reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, x->skip = 0; } -static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, - ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, +static void update_state(const AV1_COMP *const cpi, + const TileDataEnc *const tile_data, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int i, x_idx, y; const AV1_COMMON *const cm = &cpi->common; @@ -359,7 +361,7 @@ static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - MB_MODE_INFO *mi = &ctx->mic; + const MB_MODE_INFO *const mi = &ctx->mic; MB_MODE_INFO *const mi_addr = xd->mi[0]; const struct segmentation *const seg = &cm->seg; const int bw = mi_size_wide[mi->sb_type]; @@ -505,12 +507,12 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q); } -static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, +static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { - const AV1_COMMON *const cm = &cpi->common; + AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; @@ -522,6 +524,13 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode; int i, orig_rdmult; + if (best_rd < 0) { + ctx->rdcost = INT64_MAX; + ctx->skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + aom_clear_system_state(); set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); @@ -588,9 +597,10 @@ static void 
rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (aq_mode == VARIANCE_AQ) { if (cpi->vaq_refresh) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); - mbmi->segment_id = av1_vaq_segment_id(energy); + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; } x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { @@ -1407,8 +1417,8 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, - PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx, - int *rate) { + PARTITION_TYPE partition, + const PICK_MODE_CONTEXT *const ctx, int *rate) { TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; @@ -1691,7 +1701,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); } if (do_partition_search && @@ -1728,7 +1738,20 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; } } - + for (int b = 0; b < 2; ++b) { + pc_tree->horizontal[b].skip_ref_frame_mask = 0; + pc_tree->vertical[b].skip_ref_frame_mask = 0; + } + for (int b = 0; b < 3; ++b) { + pc_tree->horizontala[b].skip_ref_frame_mask = 0; + pc_tree->horizontalb[b].skip_ref_frame_mask = 0; + pc_tree->verticala[b].skip_ref_frame_mask = 0; + pc_tree->verticalb[b].skip_ref_frame_mask = 0; + } + for (int b = 0; b < 4; ++b) { + pc_tree->horizontal4[b].skip_ref_frame_mask = 0; + pc_tree->vertical4[b].skip_ref_frame_mask = 0; + } switch (partition) { case PARTITION_NONE: 
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, @@ -1741,7 +1764,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) { RD_STATS tmp_rdc; - PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; av1_init_rd_stats(&tmp_rdc); update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, @@ -1765,7 +1788,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) { RD_STATS tmp_rdc; - PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0]; + const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0]; av1_init_rd_stats(&tmp_rdc); update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, @@ -1812,7 +1835,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: - case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types"); + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); default: assert(0); break; } @@ -2164,7 +2188,8 @@ static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } -static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { +static INLINE void load_pred_mv(MACROBLOCK *x, + const PICK_MODE_CONTEXT *const ctx) { memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } @@ -2221,12 +2246,11 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, // Try searching for an encoding for the given subblock. 
Returns zero if the // rdcost is already too high (to tell the caller not to bother searching for // encodings of further subblocks) -static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int is_first, int is_last, int mi_row, int mi_col, - BLOCK_SIZE subsize, RD_STATS *best_rdc, - RD_STATS *sum_rdc, RD_STATS *this_rdc, - PARTITION_TYPE partition, +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS *best_rdc, RD_STATS *sum_rdc, + RD_STATS *this_rdc, PARTITION_TYPE partition, PICK_MODE_CONTEXT *prev_ctx, PICK_MODE_CONTEXT *this_ctx) { #define RTS_X_RATE_NOCOEF_ARG @@ -2236,25 +2260,20 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); - // On the first time around, write the rd stats straight to sum_rdc. Also, we - // should treat sum_rdc as containing zeros (even if it doesn't) to avoid - // having to zero it at the start. - if (is_first) this_rdc = sum_rdc; - const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost; - const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost; + const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX + ? 
INT64_MAX + : (best_rdc->rdcost - sum_rdc->rdcost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, rdcost_remaining); - if (!is_first) { - if (this_rdc->rate == INT_MAX) { - sum_rdc->rdcost = INT64_MAX; - } else { - sum_rdc->rate += this_rdc->rate; - sum_rdc->dist += this_rdc->dist; - sum_rdc->rdcost += this_rdc->rdcost; - } + if (this_rdc->rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc->rate; + sum_rdc->dist += this_rdc->dist; + sum_rdc->rdcost += this_rdc->rdcost; } if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0; @@ -2271,7 +2290,7 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td, #undef RTS_MAX_RDCOST } -static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, +static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, PICK_MODE_CONTEXT ctxs[3], @@ -2284,13 +2303,16 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, MACROBLOCKD *const xd = &x->e_mbd; RD_STATS sum_rdc, this_rdc; #define RTP_STX_TRY_ARGS - - if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0, + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, ctx, &ctxs[0])) return; - if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1, + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1])) return; @@ -2302,15 +2324,13 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, // difference (obviously) 
doesn't contribute to the error. const int try_block2 = 1; if (try_block2 && - !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2, + !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2, best_rdc, &sum_rdc, &this_rdc, RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2])) return; if (sum_rdc.rdcost >= best_rdc->rdcost) return; - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rdc.rate += x->partition_cost[pl][partition]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost >= best_rdc->rdcost) return; @@ -2321,45 +2341,6 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td, #undef RTP_STX_TRY_ARGS } -#if CONFIG_DIST_8X8 -static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x, - uint8_t *src_plane_8x8[MAX_MB_PLANE], - uint8_t *dst_plane_8x8[MAX_MB_PLANE]) { - const AV1_COMMON *const cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - MACROBLOCKD *const xd = &x->e_mbd; - int64_t dist_8x8, dist_8x8_uv, total_dist; - const int src_stride = x->plane[0].src.stride; - int plane; - - const int dst_stride = xd->plane[0].dst.stride; - dist_8x8 = - av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0], - dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex) - << 4; - - // Compute chroma distortion for a luma 8x8 block - dist_8x8_uv = 0; - - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - unsigned sse; - const int src_stride_uv = x->plane[plane].src.stride; - const int dst_stride_uv = xd->plane[plane].dst.stride; - const int ssx = xd->plane[plane].subsampling_x; - const int ssy = xd->plane[plane].subsampling_y; - const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy); - - cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv, - dst_plane_8x8[plane], dst_stride_uv, &sse); - dist_8x8_uv += (int64_t)sse << 4; - } - } - - return total_dist = dist_8x8 + dist_8x8_uv; -} -#endif // 
CONFIG_DIST_8X8 - static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { pc_tree->partitioning = PARTITION_NONE; pc_tree->cb_search_range = SEARCH_FULL_PLANE; @@ -2372,7 +2353,7 @@ static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } -static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, +static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int64_t best_rd, @@ -2410,7 +2391,12 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, (void)*tp_orig; (void)split_rd; - av1_zero(pc_tree->pc_tree_stats); + if (best_rd < 0) { + pc_tree->none.rdcost = INT64_MAX; + pc_tree->none.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } pc_tree->pc_tree_stats.valid = 1; // Override partition costs at the edges of the frame in the same @@ -2441,9 +2427,11 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just - // leftover from encoding the previous block. Setting it to magic number + // leftover from encoding the previous block. Setting it to fixed pattern // when debugging. 
- memset(x->blk_skip, 234, sizeof(x->blk_skip)); + // bit 0, 1, 2 are blk_skip of each plane + // bit 4, 5, 6 are initialization checking of each plane + memset(x->blk_skip, 0x77, sizeof(x->blk_skip)); #endif // NDEBUG assert(mi_size_wide[bsize] == mi_size_high[bsize]); @@ -2456,19 +2444,35 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) + do_square_split = 0; + } +#endif + // PARTITION_NONE if (partition_none_allowed) { - if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; - + int pt_cost = 0; + if (bsize_at_least_8x8) { + pc_tree->partitioning = PARTITION_NONE; + pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + } + int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0); + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - partition_rd_cost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost; pc_tree->pc_tree_stats.skip = ctx_none->skip; @@ -2476,9 +2480,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, if (none_rd) *none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { if (bsize_at_least_8x8) { - const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX - ? 
partition_cost[PARTITION_NONE] - : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); } @@ -2520,17 +2521,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, int64_t temp_best_rdcost = best_rdc.rdcost; pn_rdc = best_rdc; -#if CONFIG_DIST_8X8 - uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; - - if (x->using_dist_8x8 && bsize == BLOCK_8X8) { - for (int i = 0; i < MAX_MB_PLANE; i++) { - src_plane_8x8[i] = x->plane[i].src.buf; - dst_plane_8x8[i] = xd->plane[i].dst.buf; - } - } -#endif // CONFIG_DIST_8X8 - // PARTITION_SPLIT if (do_square_split) { int reached_last_index = 0; @@ -2548,6 +2538,8 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, pc_tree->split[idx]->index = idx; int64_t *p_split_rd = &split_rd[idx]; + // TODO(Cherma) : Account for partition cost while passing best rd to + // rd_pick_sqr_partition() rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rdc, temp_best_rdcost - sum_rdc.rdcost, @@ -2568,14 +2560,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, } reached_last_index = (idx == 4); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 - if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); @@ -2634,14 +2618,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td, } } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && - best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, 
DRY_RUN_NORMAL, bsize, - pc_tree, NULL); - } -#endif // CONFIG_DIST_8X8 - if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); @@ -2791,6 +2767,99 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, } #undef FEATURE_SIZE +static void ml_prune_rect_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_rd, int64_t none_rd, + int64_t *split_rd, + int *const dst_prune_horz, + int *const dst_prune_vert) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + aom_clear_system_state(); + + // 1. 
Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < 4; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + whole_block_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + whole_block_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[4]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + split_variance[i] = + av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); + } else { + split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize); + } + } + + for (int i = 0; i < 4; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // 2. 
Do the prediction and prune 0-2 partitions based on their probabilities + float raw_scores[3] = { 0.0f }; + av1_nn_predict(features, nn_config, raw_scores); + float probs[3] = { 0.0f }; + av1_nn_softmax(raw_scores, probs, 3); + + // probs[0] is the probability of the fact that both rectangular partitions + // are worse than current best_rd + if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1; + if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1; +} + // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, @@ -2880,13 +2949,14 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, #define FEATURES 18 #define LABELS 4 // Use a ML model to predict if horz4 and vert4 should be considered. -static void ml_prune_4_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t horz_rd[2], int64_t vert_rd[2], - int64_t split_rd[4], +static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], int *const partition_horz4_allowed, - int *const partition_vert4_allowed) { + int *const partition_vert4_allowed, + unsigned int pb_source_variance, int mi_row, + int mi_col) { if (best_rd >= 1000000000) return; const NN_CONFIG *nn_config = NULL; switch (bsize) { @@ -2903,7 +2973,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, float features[FEATURES]; int feature_index = 0; features[feature_index++] = (float)part_ctx; - features[feature_index++] = (float)get_unsigned_bits(x->source_variance); + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); const int rdcost = (int)AOMMIN(INT_MAX, best_rd); int sub_block_rdcost[8] = { 0 }; @@ -2937,6 +3007,8 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, { 
BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common)); const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; const MACROBLOCKD *const xd = &x->e_mbd; @@ -2990,7 +3062,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, } } - const float denom = (float)(x->source_variance + 1); + const float denom = (float)(pb_source_variance + 1); const float low_b = 0.1f; const float high_b = 10.0f; for (int i = 0; i < 4; ++i) { @@ -3022,9 +3094,9 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, // Make decisions based on the model scores. int thresh = max_score; switch (bsize) { - case BLOCK_16X16: thresh -= 400; break; - case BLOCK_32X32: thresh -= 400; break; - case BLOCK_64X64: thresh -= 100; break; + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; default: break; } *partition_horz4_allowed = 0; @@ -3039,10 +3111,73 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, #undef FEATURES #undef LABELS +#define FEATURES 4 +// ML-based partition search breakout. 
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance) { + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return 0; + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + aom_clear_system_state(); + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0]; + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, &score); + + // Make decision. 
+ return (int)(score * 100) >= thresh; +} +#undef FEATURES + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, +static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int64_t best_rd, @@ -3068,6 +3203,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0]; int do_rectangular_split = 1; + int64_t cur_none_rd = 0; int64_t split_rd[4] = { 0, 0, 0, 0 }; int64_t horz_rd[2] = { 0, 0 }; int64_t vert_rd[2] = { 0, 0 }; @@ -3077,6 +3213,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, int vert_ctx_is_ready = 0; BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + if (best_rd < 0) { + pc_tree->none.rdcost = INT64_MAX; + pc_tree->none.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks @@ -3129,9 +3271,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just - // leftover from encoding the previous block. Setting it to magic number + // leftover from encoding the previous block. Setting it to fixed pattern // when debugging. 
- memset(x->blk_skip, 234, sizeof(x->blk_skip)); + // bit 0, 1, 2 are blk_skip of each plane + // bit 4, 5, 6 are initialization checking of each plane + memset(x->blk_skip, 0x77, sizeof(x->blk_skip)); #endif // NDEBUG assert(mi_size_wide[bsize] == mi_size_high[bsize]); @@ -3143,7 +3287,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) - x->mb_energy = av1_block_energy(cpi, x, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { const int cb_partition_search_ctrl = @@ -3285,22 +3429,56 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #endif + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + int ref_frames_used[4] = { + 0, + }; + BEGIN_PARTITION_SEARCH: if (x->must_find_valid_partition) { partition_none_allowed = has_rows && has_cols; partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; } + + // Partition block source pixel variance. + unsigned int pb_source_variance = UINT_MAX; + +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8) partition_horz_allowed = 0; + if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0; + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) + do_square_split = 0; + } +#endif + // PARTITION_NONE if (partition_none_allowed) { + int pt_cost = 0; + if (bsize_at_least_8x8) { + pt_cost = partition_cost[PARTITION_NONE] < INT_MAX + ? partition_cost[PARTITION_NONE] + : 0; + } + int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0); + int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX) + ? 
INT64_MAX + : (best_rdc.rdcost - partition_rd_cost); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost); + PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); + pb_source_variance = x->source_variance; if (none_rd) *none_rd = this_rdc.rdcost; + cur_none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame); + for (int i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref_type); + } + } if (bsize_at_least_8x8) { - const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX - ? partition_cost[PARTITION_NONE] - : 0; this_rdc.rate += pt_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); } @@ -3318,16 +3496,29 @@ BEGIN_PARTITION_SEARCH: best_rdc = this_rdc; if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; - // If all y, u, v transform blocks in this partition are skippable, and - // the dist & rate are within the thresholds, the partition search is - // terminated for current branch of the partition search tree. - // The dist & rate thresholds are set to 0 at speed 0 to disable the - // early termination at that speed. 
- if (!x->e_mbd.lossless[xd->mi[0]->segment_id] && - (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr)) { - do_square_split = 0; - do_rectangular_split = 0; + if ((do_square_split || do_rectangular_split) && + !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { + const int use_ml_based_breakout = + bsize <= cpi->sf.use_square_partition_only_threshold && + bsize > BLOCK_4X4 && xd->bd == 8; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc, + pb_source_variance)) { + do_square_split = 0; + do_rectangular_split = 0; + } + } + + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition + // search is terminated for current branch of the partition search + // tree. The dist & rate thresholds are set to 0 at speed 0 to + // disable the early termination at that speed. + if (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr) { + do_square_split = 0; + do_rectangular_split = 0; + } } #if CONFIG_FP_MB_STATS @@ -3384,24 +3575,14 @@ BEGIN_PARTITION_SEARCH: // store estimated motion vector if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); -#if CONFIG_DIST_8X8 - uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE]; - - if (x->using_dist_8x8 && bsize == BLOCK_8X8) { - for (int i = 0; i < num_planes; i++) { - src_plane_8x8[i] = x->plane[i].src.buf; - dst_plane_8x8[i] = xd->plane[i].dst.buf; - } - } -#endif // CONFIG_DIST_8X8 - // PARTITION_SPLIT if (do_square_split) { av1_init_rd_stats(&sum_rdc); - int reached_last_index = 0; subsize = get_partition_subsize(bsize, PARTITION_SPLIT); - int idx; + sum_rdc.rate = partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + int idx; for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { const int x_idx = (idx & 1) * mi_step; const int y_idx = (idx >> 1) * mi_step; @@ -3413,8 
+3594,13 @@ BEGIN_PARTITION_SEARCH: pc_tree->split[idx]->index = idx; int64_t *p_split_rd = &split_rd[idx]; + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[idx]->none.rate = INT_MAX; rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, + subsize, &this_rdc, best_remain_rdcost, pc_tree->split[idx], p_split_rd); if (this_rdc.rate == INT_MAX) { @@ -3424,11 +3610,16 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; - + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[idx]->none.rate != INT_MAX) { + const int ref_type = + av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame); + ref_frames_used[idx] |= (1 << ref_type); + } if (idx <= 1 && (bsize <= BLOCK_8X8 || pc_tree->split[idx]->partitioning == PARTITION_NONE)) { - MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1; @@ -3436,60 +3627,83 @@ BEGIN_PARTITION_SEARCH: } } } - reached_last_index = (idx == 4); - -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && reached_last_index && - sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist 
= dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 + const int reached_last_index = (idx == 4); if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_SPLIT; } - } else if (cpi->sf.less_rectangular_check) { + } else if (cpi->sf.less_rectangular_check_level > 0) { // skip rectangular partition test when larger block size // gives better rd cost - do_rectangular_split &= !partition_none_allowed; + if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2) + do_rectangular_split &= !partition_none_allowed; } restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // if (do_split) + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + + int prune_horz = 0; + int prune_vert = 0; + if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) && + (partition_horz_allowed || partition_vert_allowed)) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd, + split_rd, 
&prune_horz, &prune_vert); + } + // PARTITION_HORZ - if (partition_horz_allowed && + if (partition_horz_allowed && !prune_horz && (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_HORZ); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->horizontal[0].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, + } + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + sum_rdc.rate = partition_cost[PARTITION_HORZ]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ, subsize, &pc_tree->horizontal[0], - best_rdc.rdcost); - horz_rd[0] = sum_rdc.rdcost; + best_remain_rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + horz_rd[0] = this_rdc.rdcost; if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) { - PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; - MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; + const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1; @@ -3501,24 +3715,15 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h); if (cpi->sf.adaptive_pred_interp_filter && bsize == 
BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->horizontal[1].pred_interp_filter = av1_extract_interp_filter(ctx_h->mic.interp_filters, 0); - + } rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, PARTITION_HORZ, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); horz_rd[1] = this_rdc.rdcost; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, tile_data, td, &pc_tree->horizontal[1], - mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, - mi_row + mi_step, mi_col, subsize, NULL); - } -#endif // CONFIG_DIST_8X8 - if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { @@ -3526,24 +3731,9 @@ BEGIN_PARTITION_SEARCH: sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && - bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_HORZ]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3555,7 +3745,7 @@ BEGIN_PARTITION_SEARCH: } // PARTITION_VERT - if (partition_vert_allowed && + if (partition_vert_allowed && !prune_vert && (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_VERT); @@ -3563,18 +3753,31 @@ BEGIN_PARTITION_SEARCH: 
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->vertical[0].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, + } + sum_rdc.rate = partition_cost[PARTITION_VERT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX + ? INT64_MAX + : (best_rdc.rdcost - sum_rdc.rdcost); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT, subsize, &pc_tree->vertical[0], - best_rdc.rdcost); - vert_rd[0] = sum_rdc.rdcost; + best_remain_rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + vert_rd[0] = this_rdc.rdcost; const int64_t vert_max_rdcost = best_rdc.rdcost; if (sum_rdc.rdcost < vert_max_rdcost && has_cols) { - MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic); - PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1; @@ -3587,24 +3790,15 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && - partition_none_allowed) + partition_none_allowed) { pc_tree->vertical[1].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); - + } rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, PARTITION_VERT, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); vert_rd[1] = 
this_rdc.rdcost; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { - update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row, - mi_col + mi_step, subsize, DRY_RUN_NORMAL); - encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, - mi_col + mi_step, subsize, NULL); - } -#endif // CONFIG_DIST_8X8 - if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { @@ -3612,25 +3806,9 @@ BEGIN_PARTITION_SEARCH: sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX && - bsize == BLOCK_8X8) { - int64_t dist_8x8; - dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8); -#ifdef DEBUG_DIST_8X8 - // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && - 0 /* !CONFIG_CFL */) - assert(sum_rdc.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - sum_rdc.dist = dist_8x8; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); - } -#endif // CONFIG_DIST_8X8 } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_VERT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3641,6 +3819,17 @@ BEGIN_PARTITION_SEARCH: restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + pb_source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + pb_source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + } + const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; @@ -3649,15 +3838,26 @@ BEGIN_PARTITION_SEARCH: int horzab_partition_allowed = ext_partition_allowed; 
int vertab_partition_allowed = ext_partition_allowed; +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) { + horzab_partition_allowed = 0; + vertab_partition_allowed = 0; + } + } +#endif + if (cpi->sf.prune_ext_partition_types_search_level) { if (cpi->sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || (pc_tree->partitioning == PARTITION_NONE && - x->source_variance < 32) || + pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || (pc_tree->partitioning == PARTITION_NONE && - x->source_variance < 32) || + pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); } else { horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || @@ -3712,6 +3912,9 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed && partition_horz_allowed && partition_vert_allowed) { + // TODO(huisu@google.com): x->source_variance may not be the current block's + // variance. The correct one to use is pb_source_variance. + // Need to re-train the model to fix it. 
ml_prune_ab_partition(bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdc.rdcost, horz_rd, vert_rd, split_rd, @@ -3736,6 +3939,21 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontala[1].rd_mode_is_ready = 1; } } + pc_tree->horizontala[0].skip_ref_frame_mask = 0; + pc_tree->horizontala[1].skip_ref_frame_mask = 0; + pc_tree->horizontala[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0]; + if (used_frames) + pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1]; + if (used_frames) + pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) + pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row, @@ -3754,6 +3972,21 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; pc_tree->horizontalb[0].rd_mode_is_ready = 1; } + pc_tree->horizontalb[0].skip_ref_frame_mask = 0; + pc_tree->horizontalb[1].skip_ref_frame_mask = 0; + pc_tree->horizontalb[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) + pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2]; + if (used_frames) + pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[3]; + if (used_frames) + pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col, subsize, @@ -3773,6 +4006,18 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticala[0].mic.partition 
= PARTITION_VERT_A; pc_tree->verticala[0].rd_mode_is_ready = 1; } + pc_tree->verticala[0].skip_ref_frame_mask = 0; + pc_tree->verticala[1].skip_ref_frame_mask = 0; + pc_tree->verticala[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0]; + if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2]; + if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col, bsize2, @@ -3791,6 +4036,18 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; pc_tree->verticalb[0].rd_mode_is_ready = 1; } + pc_tree->verticalb[0].skip_ref_frame_mask = 0; + pc_tree->verticalb[1].skip_ref_frame_mask = 0; + pc_tree->verticalb[2].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + int used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1]; + if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[3]; + if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames; + } rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row, @@ -3823,9 +4080,19 @@ BEGIN_PARTITION_SEARCH: partition_horz_allowed && partition_vert_allowed) { ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost, horz_rd, vert_rd, split_rd, &partition_horz4_allowed, - &partition_vert4_allowed); + &partition_vert4_allowed, 
pb_source_variance, mi_row, + mi_col); } +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) { + partition_horz4_allowed = 0; + partition_vert4_allowed = 0; + } + } +#endif + // PARTITION_HORZ_4 if (partition_horz4_allowed && has_rows && (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { @@ -3834,25 +4101,33 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_prev = ctx_none; subsize = get_partition_subsize(bsize, PARTITION_HORZ_4); + sum_rdc.rate = partition_cost[PARTITION_HORZ_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); for (int i = 0; i < 4; ++i) { - int this_mi_row = mi_row + i * quarter_step; + const int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= cm->mi_rows) break; PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i]; ctx_this->rd_mode_is_ready = 0; - if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), - this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc, - &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this)) + ctx_this->skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int used_frames = i <= 1 + ? 
(ref_frames_used[0] | ref_frames_used[1]) + : (ref_frames_used[2] | ref_frames_used[3]); + if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; + } + if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row, + mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, + PARTITION_HORZ_4, ctx_prev, ctx_this)) break; ctx_prev = ctx_this; } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_HORZ_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3870,16 +4145,25 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_prev = ctx_none; subsize = get_partition_subsize(bsize, PARTITION_VERT_4); + sum_rdc.rate = partition_cost[PARTITION_VERT_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); for (int i = 0; i < 4; ++i) { - int this_mi_col = mi_col + i * quarter_step; + const int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= cm->mi_cols) break; PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i]; ctx_this->rd_mode_is_ready = 0; - if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row, + ctx_this->skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int used_frames = i <= 1 + ? 
(ref_frames_used[0] | ref_frames_used[2]) + : (ref_frames_used[1] | ref_frames_used[3]); + if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; + } + if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row, this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_VERT_4, ctx_prev, ctx_this)) break; @@ -3888,7 +4172,6 @@ BEGIN_PARTITION_SEARCH: } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += partition_cost[PARTITION_VERT_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; @@ -3924,14 +4207,6 @@ BEGIN_PARTITION_SEARCH: } } -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && best_rdc.rate < INT_MAX && - best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { - encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, - pc_tree, NULL); - } -#endif // CONFIG_DIST_8X8 - if (bsize == cm->seq_params.sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); @@ -3950,6 +4225,15 @@ static void init_first_partition_pass_stats_tables( } } +// clear pc_tree_stats +static INLINE void clear_pc_tree_stats(PC_TREE *pt) { + if (pt == NULL) return; + pt->pc_tree_stats.valid = 0; + for (int i = 0; i < 4; ++i) { + clear_pc_tree_stats(pt->split[i]); + } +} + // Minimum number of samples to trigger the // mode_pruning_based_on_two_pass_partition_search feature. 
#define FIRST_PARTITION_PASS_MIN_SAMPLES 16 @@ -3963,7 +4247,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; SPEED_FEATURES *const sf = &cpi->sf; - int mi_col; const int leaf_nodes = 256; // Initialize the left context for the new SB row @@ -3977,26 +4260,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } + PC_TREE *const pc_root = + td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; // Code each SB in the row - for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; + for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += cm->seq_params.mib_size) { - const struct segmentation *const seg = &cm->seg; - int dummy_rate; - int64_t dummy_dist; - RD_STATS dummy_rdc; - int i; - int seg_skip = 0; - - const int idx_str = cm->mi_stride * mi_row + mi_col; - MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; - PC_TREE *const pc_root = - td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2]; - av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); av1_fill_mode_rates(cm, x, xd->tile_ctx); if (sf->adaptive_pred_interp_filter) { - for (i = 0; i < leaf_nodes; ++i) { + for (int i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; @@ -4015,10 +4288,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, av1_zero(x->pred_mv); pc_root->index = 0; + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - int segment_id = + const int segment_id = map ? 
get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col) : 0; seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); @@ -4039,15 +4314,14 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, cpi, block_wavelet_energy_level); } else { const int block_var_level = - av1_block_energy(cpi, x, cm->seq_params.sb_size); + av1_log_block_var(cpi, x, cm->seq_params.sb_size); x->sb_energy_level = block_var_level; offset_qindex = av1_compute_deltaq_from_energy_level(cpi, block_var_level); } - int qmask = ~(cm->delta_q_res - 1); + const int qmask = ~(cm->delta_q_res - 1); int current_qindex = clamp(cm->base_qindex + offset_qindex, cm->delta_q_res, 256 - cm->delta_q_res); - current_qindex = ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) + cm->base_qindex; @@ -4058,18 +4332,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, xd->mi[0]->current_qindex = current_qindex; av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) { - int j, k; - int lfmask = ~(cm->delta_lf_res - 1); - int delta_lf_from_base = offset_qindex / 2; - delta_lf_from_base = - ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); + const int lfmask = ~(cm->delta_lf_res - 1); + const int delta_lf_from_base = + ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask); // pre-set the delta lf for loop filter. 
Note that this value is set // before mi is assigned for each block in current superblock - for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); - j++) { - for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); - k++) { + for (int j = 0; + j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) { + for (int k = 0; + k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) { cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] .delta_lf_from_base = clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); @@ -4085,19 +4357,24 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + const int idx_str = cm->mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; x->source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { - BLOCK_SIZE bsize; set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); - bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size; + const BLOCK_SIZE bsize = + seg_skip ? 
cm->seq_params.sb_size : sf->always_this_block_size; set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, pc_root); } else if (cpi->partition_search_skippable_frame) { - BLOCK_SIZE bsize; set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size); - bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1, @@ -4113,9 +4390,9 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, reset_partition(pc_root, cm->seq_params.sb_size); x->use_cb_search_range = 0; init_first_partition_pass_stats_tables(x->first_partition_pass_stats); + // Do the first pass if we need two pass partition search if (cpi->sf.two_pass_partition_search && - cpi->sf.use_square_partition_only_threshold < - cm->seq_params.sb_size && + cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 && mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows && mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols && cm->frame_type != KEY_FRAME) { @@ -4123,6 +4400,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, // Reset the stats tables. 
if (sf->mode_pruning_based_on_two_pass_partition_search) av1_zero(x->first_partition_pass_stats); + clear_pc_tree_stats(pc_root); rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root, NULL); @@ -4130,7 +4408,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->source_variance = UINT_MAX; if (sf->adaptive_pred_interp_filter) { - for (i = 0; i < leaf_nodes; ++i) { + for (int i = 0; i < leaf_nodes; ++i) { td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; @@ -4157,7 +4435,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->use_cb_search_range = 1; if (sf->mode_pruning_based_on_two_pass_partition_search) { - for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i]; if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { @@ -4174,21 +4452,17 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, } } } - - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, - cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, - pc_root, NULL); - } else { - rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, - cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, - pc_root, NULL); } + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root, + NULL); } #if CONFIG_COLLECT_INTER_MODE_RD_STATS // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. 
if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && cm->tile_rows == 1) { - av1_inter_mode_data_fit(x->rdmult); + av1_inter_mode_data_fit(tile_data, x->rdmult); } #endif } @@ -4233,6 +4507,32 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi) { return cpi->common.tx_mode; } +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int tile_col, tile_row; + + if (cpi->tile_data != NULL) aom_free(cpi->tile_data); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + int i, j; + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = 32; + tile_data->mode_map[i][j] = j; + } + } + } +} + void av1_init_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -4240,28 +4540,9 @@ void av1_init_tile_data(AV1_COMP *cpi) { const int tile_rows = cm->tile_rows; int tile_col, tile_row; TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; unsigned int tile_tok = 0; - - if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { - if (cpi->tile_data != NULL) aom_free(cpi->tile_data); - CHECK_MEM_ERROR( - cm, cpi->tile_data, - aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); - cpi->allocated_tiles = tile_cols * tile_rows; - - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *const tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - int i, j; - for (i = 0; i < BLOCK_SIZES_ALL; ++i) { - for (j = 0; 
j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; - tile_data->mode_map[i][j] = j; - } - } - } - } + int tplist_count = 0; for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { @@ -4274,6 +4555,9 @@ void av1_init_tile_data(AV1_COMP *cpi) { pre_tok = cpi->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens( *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); tile_data->allow_update_cdf = !cm->large_scale_tile; tile_data->allow_update_cdf = tile_data->allow_update_cdf && !cm->disable_cdf_update; @@ -4281,15 +4565,56 @@ void av1_init_tile_data(AV1_COMP *cpi) { } } +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int sb_row_in_tile; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + + int num_mb_rows_in_sb = + ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + + sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok; + + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + + cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok; + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start); + + 
assert( + (unsigned int)(tok - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes)); + + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + av1_inter_mode_data_init(this_tile); +#endif + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, tile_info->mi_col_end, tile_row); av1_init_above_context(cm, &td->mb.e_mbd, tile_row); @@ -4310,25 +4635,23 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += cm->seq_params.mib_size) { - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } - - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(cpi->tok_count[tile_row][tile_col] <= - allocated_tokens(*tile_info, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, - av1_num_planes(cm))); } static void encode_tiles(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; int tile_col, tile_row; + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + av1_init_tile_data(cpi); - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); 
cpi->intrabc_used |= cpi->td.intrabc_used_this_tile; } @@ -4616,6 +4939,13 @@ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { return 0; } +static void set_default_interp_skip_flags(AV1_COMP *cpi) { + const int num_planes = av1_num_planes(&cpi->common); + cpi->default_interp_skip_flags = (num_planes == 1) + ? DEFAULT_LUMA_INTERP_SKIP_FLAG + : DEFAULT_INTERP_SKIP_FLAG; +} + static void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; @@ -4683,41 +5013,41 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_hash_table_create(&cm->cur_frame->hash_table); av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 4); av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 8); av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 16); av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 32); 
av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0], block_hash_values[1], is_block_same[0], - is_block_same[1]); + is_block_same[1], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2], pic_width, pic_height, 64); av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1], block_hash_values[0], is_block_same[1], - is_block_same[0]); + is_block_same[0], &cpi->td.mb); av1_add_to_hash_map_by_row_with_precal_data( &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2], pic_width, pic_height, 128); @@ -4769,7 +5099,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_initialize_rd_consts(cpi); av1_initialize_me_consts(cpi, x, cm->base_qindex); init_encode_frame_mb_context(cpi); - + set_default_interp_skip_flags(cpi); if (cm->prev_frame) cm->last_frame_seg_map = cm->prev_frame->seg_map; else @@ -4793,6 +5123,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); +#if !CONFIG_GLOBAL_MOTION_SEARCH + cpi->global_motion_search_done = 1; +#endif // !CONFIG_GLOBAL_MOTION_SEARCH if (cpi->common.frame_type == INTER_FRAME && cpi->source && !cpi->global_motion_search_done) { YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; @@ -4939,27 +5272,13 @@ static void encode_frame_internal(AV1_COMP *cpi) { } #endif -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - av1_inter_mode_data_init(); -#endif - - // If allowed, encoding tiles in parallel with one thread handling one tile. 
- // TODO(geza.lore): The multi-threaded encoder is not safe with more than - // 1 tile rows, as it uses the single above_context et al arrays from - // cpi->common - if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1) + if (cpi->row_mt && (cpi->oxcf.max_threads > 1)) + av1_encode_tiles_mt(cpi); + else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1) av1_encode_tiles_mt(cpi); else encode_tiles(cpi); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS -#if INTER_MODE_RD_TEST - if (cpi->sf.inter_mode_rd_model_estimation) { - av1_inter_mode_data_show(cm); - } -#endif -#endif - aom_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); } @@ -5407,7 +5726,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; } mbmi->tx_size = tx_size; - set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h, (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); } CFL_CTX *const cfl = &xd->cfl; diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h index 62141dba4..e8cf9b468 100644 --- a/third_party/aom/av1/encoder/encodeframe.h +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_ENCODEFRAME_H_ -#define AV1_ENCODER_ENCODEFRAME_H_ +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ #include "aom/aom_integer.h" #include "av1/common/blockd.h" @@ -20,7 +20,7 @@ extern "C" { #endif -#define DELTAQ_MODULATION 0 // 0: variance based, 1: wavelet AC energy based +#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based struct macroblock; struct yv12_buffer_config; @@ -33,12 +33,15 @@ void av1_setup_src_planes(struct macroblock *x, void av1_encode_frame(struct AV1_COMP *cpi); +void av1_alloc_tile_data(struct AV1_COMP *cpi); void av1_init_tile_data(struct AV1_COMP *cpi); void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEFRAME_H_ +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c index cea8db6f9..ad12577e6 100644 --- a/third_party/aom/av1/encoder/encodemb.c +++ b/third_party/aom/av1/encoder/encodemb.c @@ -222,11 +222,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, a = &args->ta[blk_col]; l = &args->tl[blk_row]; - // Assert not magic number (uninitialized). 
- assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); - if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) && - !mbmi->skip_mode) { + if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) { TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); if (args->enable_optimize_b) { @@ -350,6 +347,66 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, } } +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + int i = 0, r, c; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. 
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + + int blk_row, blk_col; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) { + const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); + for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) { + for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) { + visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); + i += step; + } + } + } + } +} + +void av1_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + foreach_transformed_block_visitor visit, + void *arg, const int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) + continue; + av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); + } +} + typedef struct encode_block_pass1_args { AV1_COMMON *cm; MACROBLOCK *x; @@ -382,7 +439,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, txfm_param.tx_set_type = av1_get_ext_tx_set_type( txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used); if 
(txfm_param.is_hbd) { - av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); @@ -513,9 +570,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - // Assert not magic number (uninitialized). - assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234); - if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) { + if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) { *eob = 0; p->txb_entropy_ctx[block] = 0; } else { diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index 673f87ea7..39080de59 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_ENCODEMB_H_ -#define AV1_ENCODER_ENCODEMB_H_ +#ifndef AOM_AV1_ENCODER_ENCODEMB_H_ +#define AOM_AV1_ENCODER_ENCODEMB_H_ #include "config/aom_config.h" @@ -47,7 +47,18 @@ typedef enum AV1_XFORM_QUANT { void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, RUN_TYPE dry_run); + +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg); + +void av1_foreach_transformed_block(const MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + foreach_transformed_block_visitor visit, + void *arg, const int num_planes); + void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); + void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, TX_TYPE tx_type, @@ -82,4 +93,4 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEMB_H_ +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index 944e2c53d..42eb5abf6 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -18,19 +18,37 @@ #include "av1/encoder/encodemv.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) + ? 
MV_CLASS_10 + : (MV_CLASS_TYPE)log_in_base_2(z >> 3); + if (offset) *offset = z - mv_class_base(c); + return c; +} static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, MvSubpelPrecision precision) { + assert(comp != 0); int offset; const int sign = comp < 0; const int mag = sign ? -comp : comp; - const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int mv_class = get_mv_class(mag - 1, &offset); const int d = offset >> 3; // int mv data const int fr = (offset >> 1) & 3; // fractional mv data const int hp = offset & 1; // high precision mv data - assert(comp != 0); - // Sign aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); @@ -89,7 +107,7 @@ static void build_nmv_component_cost_table(int *mvcost, for (v = 1; v <= MV_MAX; ++v) { int z, c, o, d, e, f, cost = 0; z = v - 1; - c = av1_get_mv_class(z, &o); + c = get_mv_class(z, &o); cost += class_cost[c]; d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h index 64e9e7162..37ff547c8 100644 --- a/third_party/aom/av1/encoder/encodemv.h +++ b/third_party/aom/av1/encoder/encodemv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODEMV_H_ -#define AV1_ENCODER_ENCODEMV_H_ +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ #include "av1/encoder/encoder.h" @@ -40,8 +40,16 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp, int_mv *nearest_mv, int_mv *near_mv, int is_integer); +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? 
MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODEMV_H_ +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 13ea32e38..a2da2df89 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -14,9 +14,28 @@ #include #include "config/aom_config.h" -#include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" #include "av1/common/cdef.h" @@ -38,6 +57,7 @@ #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/firstpass.h" +#include "av1/encoder/grain_test_vectors.h" #include "av1/encoder/hash_motion.h" #include "av1/encoder/mbgraph.h" #include "av1/encoder/picklpf.h" @@ -49,26 +69,6 @@ #include "av1/encoder/speed_features.h" #include "av1/encoder/temporal_filter.h" -#include "aom_dsp/psnr.h" -#if CONFIG_INTERNAL_STATS -#include "aom_dsp/ssim.h" -#endif -#include "av1/encoder/grain_test_vectors.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#if CONFIG_DENOISE -#include "aom_dsp/grain_table.h" -#include "aom_dsp/noise_util.h" -#include "aom_dsp/noise_model.h" -#endif -#include "aom_ports/aom_timer.h" -#include "aom_ports/mem.h" -#include "aom_ports/system_state.h" 
-#include "aom_scale/aom_scale.h" -#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG - #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 // av1 uses 10,000,000 ticks/second as time stamp @@ -413,18 +413,13 @@ static void swap_mi_and_prev_mi(AV1_COMMON *cm) { } void av1_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - av1_rtcd(); - aom_dsp_rtcd(); - aom_scale_rtcd(); - av1_init_intra_predictors(); - av1_init_me_luts(); - av1_rc_init_minq_luts(); - av1_init_wedge_masks(); - init_done = 1; - } + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); + av1_rc_init_minq_luts(); + av1_init_wedge_masks(); } static void dealloc_context_buffers_ext(AV1_COMP *cpi) { @@ -506,6 +501,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->td.mb.wsrc_buf); cpi->td.mb.wsrc_buf = NULL; + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) { + aom_free(cpi->td.mb.hash_value_buffer[i][j]); + cpi->td.mb.hash_value_buffer[i][j] = NULL; + } aom_free(cpi->td.mb.mask_buf); cpi->td.mb.mask_buf = NULL; @@ -527,10 +527,18 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; + aom_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + av1_free_pc_tree(&cpi->td, num_planes); aom_free(cpi->td.mb.palette_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_obmc_bufs[j]); + } + #if CONFIG_DENOISE if (cpi->denoise_and_model) { aom_denoise_and_model_free(cpi->denoise_and_model); @@ -785,6 +793,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) { av1_alloc_context_buffers(cm, cm->width, cm->height); + int mi_rows_aligned_to_sb = + ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + av1_alloc_txb_buf(cpi); 
alloc_context_buffers_ext(cpi); @@ -797,6 +809,11 @@ static void alloc_compressor_data(AV1_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } + aom_free(cpi->tplist[0][0]); + + CHECK_MEM_ERROR(cm, cpi->tplist[0][0], + aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*cpi->tplist[0][0]))); av1_setup_pc_tree(&cpi->common, &cpi->td); } @@ -1067,6 +1084,32 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { 10; // Default value (not signaled) } + if (cm->seq_params.monochrome) { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 1; + } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 && + cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB && + cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) { + cm->seq_params.subsampling_x = 0; + cm->seq_params.subsampling_y = 0; + } else { + if (cm->seq_params.profile == 0) { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 1; + } else if (cm->seq_params.profile == 1) { + cm->seq_params.subsampling_x = 0; + cm->seq_params.subsampling_y = 0; + } else { + if (cm->seq_params.bit_depth == AOM_BITS_12) { + cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x; + cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y; + } else { + cm->seq_params.subsampling_x = 1; + cm->seq_params.subsampling_y = 0; + } + } + } + cm->width = oxcf->width; cm->height = oxcf->height; set_sb_size(&cm->seq_params, @@ -2326,6 +2369,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->oxcf = *oxcf; cpi->common.options = oxcf->cfg; + cpi->row_mt = oxcf->row_mt; x->e_mbd.bd = (int)seq_params->bit_depth; x->e_mbd.global_motion = cm->global_motion; @@ -2350,6 +2394,22 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { CHECK_MEM_ERROR(cm, x->palette_buffer, aom_memalign(16, sizeof(*x->palette_buffer))); } + + if (x->tmp_conv_dst == NULL) { 
+ CHECK_MEM_ERROR( + cm, x->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst))); + x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; + } + for (int i = 0; i < 2; ++i) { + if (x->tmp_obmc_bufs[i] == NULL) { + CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*x->tmp_obmc_bufs[i]))); + x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i]; + } + } + av1_reset_segment_features(cm); set_high_precision_mv(cpi, 1, 0); @@ -2367,11 +2427,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { rc->worst_quality = cpi->oxcf.worst_allowed_q; rc->best_quality = cpi->oxcf.best_allowed_q; - if (!oxcf->large_scale_tile) - cm->interp_filter = cpi->sf.default_interp_filter; - else - cm->interp_filter = EIGHTTAP_REGULAR; - + cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE; cm->switchable_motion_mode = 1; if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { @@ -2588,6 +2644,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf))); + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.hash_value_buffer[0][0]))); + + cpi->td.mb.g_crc_initialized = 0; + CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); @@ -2913,9 +2978,19 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. 
if (t < cpi->num_workers - 1) { aom_free(thread_data->td->palette_buffer); + aom_free(thread_data->td->tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(thread_data->td->tmp_obmc_bufs[j]); + } aom_free(thread_data->td->above_pred_buf); aom_free(thread_data->td->left_pred_buf); aom_free(thread_data->td->wsrc_buf); + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + aom_free(thread_data->td->hash_value_buffer[x][y]); + thread_data->td->hash_value_buffer[x][y] = NULL; + } + } aom_free(thread_data->td->mask_buf); aom_free(thread_data->td->counts); av1_free_pc_tree(thread_data->td, num_planes); @@ -3058,53 +3133,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } #endif -#if USE_GF16_MULTI_LAYER -static void check_show_existing_frame_gf16(AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - AV1_COMMON *const cm = &cpi->common; - const FRAME_UPDATE_TYPE next_frame_update_type = - gf_group->update_type[gf_group->index]; - - if (cm->show_existing_frame == 1) { - cm->show_existing_frame = 0; - } else if (cpi->rc.is_last_bipred_frame) { - cpi->rc.is_last_bipred_frame = 0; - cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1]; - } else if (next_frame_update_type == OVERLAY_UPDATE || - next_frame_update_type == INTNL_OVERLAY_UPDATE) { - // Check the temporal filtering status for the next OVERLAY frame - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - int which_arf = 0, arf_idx; - // Identify the index to the next overlay frame. - for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) { - which_arf = arf_idx; - break; - } - } - assert(arf_idx < num_arfs_in_gf); - if (cpi->is_arf_filter_off[which_arf]) { - cm->show_existing_frame = 1; - cpi->rc.is_src_frame_alt_ref = 1; - cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE) - ? 
cpi->ref_fb_idx[ALTREF_FRAME - 1] - : cpi->ref_fb_idx[BWDREF_FRAME - 1]; - cpi->is_arf_filter_off[which_arf] = 0; - } - } - cpi->rc.is_src_frame_ext_arf = 0; -} -#endif // USE_GF16_MULTI_LAYER - static void check_show_existing_frame(AV1_COMP *cpi) { -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - check_show_existing_frame_gf16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; AV1_COMMON *const cm = &cpi->common; const FRAME_UPDATE_TYPE next_frame_update_type = @@ -3350,13 +3379,13 @@ static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) { EXTREF_FRAME - 1 }; for (int i = 2; i > 0; --i) { - cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]]; - // [0] is allocated to the current coded frame, i.e. bwdref memcpy( cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME], sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME])); + + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]]; } } @@ -3370,52 +3399,16 @@ static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) { EXTREF_FRAME - 1 }; for (int i = 0; i < 2; ++i) { - cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]]; - // [0] is allocated to the current coded frame, i.e. 
bwdref memcpy( cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME], cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME], sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME])); - } -} -#endif // USE_SYMM_MULTI_LAYER -#if USE_GF16_MULTI_LAYER -static void update_reference_frames_gf16(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - BufferPool *const pool = cm->buffer_pool; - - if (cm->frame_type == KEY_FRAME) { - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]], - cm->new_fb_idx); - } - } else { - if (cpi->refresh_last_frame || cpi->refresh_golden_frame || - cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_alt_ref_frame) { - assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES); - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx], - cm->new_fb_idx); - } - - // TODO(zoeliu): To handle cpi->interp_filter_selected[]. - - // For GF of 16, an additional ref frame index mapping needs to be handled - // if this is the last frame to encode in the current GF group. - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE) - av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1); + cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]]; } - -#if DUMP_REF_FRAME_IMAGES == 1 - // Dump out all reference frame images. - dump_ref_frame_images(cpi); -#endif // DUMP_REF_FRAME_IMAGES } -#endif // USE_GF16_MULTI_LAYER +#endif // USE_SYMM_MULTI_LAYER static void update_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; @@ -3424,12 +3417,20 @@ static void update_reference_frames(AV1_COMP *cpi) { // for the purpose to verify no mismatch between encoder and decoder. 
if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - update_reference_frames_gf16(cpi); - return; + // In the case of show_existing frame, we will not send fresh flag + // to decoder. Any change in the reference frame buffer can be done by + // switching the virtual indices. + if (cm->show_existing_frame) { + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt2_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + cpi->rc.is_bwd_ref_frame = 0; + cpi->rc.is_last_bipred_frame = 0; + cpi->rc.is_bipred_frame = 0; } -#endif // USE_GF16_MULTI_LAYER BufferPool *const pool = cm->buffer_pool; @@ -3458,9 +3459,15 @@ static void update_reference_frames(AV1_COMP *cpi) { // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; - ref_cnt_fb(pool->frame_bufs, - &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], - cm->new_fb_idx); + // ARF in general is a better reference than overlay. We should keep ARF as + // reference instead of replacing it with overlay.
+ + if (!cpi->preserve_arf_as_gld) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]], + cm->new_fb_idx); + } + tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1]; cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp; @@ -3758,7 +3765,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) { av1_set_speed_features_framesize_independent(cpi); av1_set_rd_speed_thresholds(cpi); av1_set_rd_speed_thresholds_sub8x8(cpi); - cpi->common.interp_filter = cpi->sf.default_interp_filter; + cpi->common.interp_filter = SWITCHABLE; cpi->common.switchable_motion_mode = 1; } @@ -3818,7 +3825,8 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy, rst[2].restoration_unit_size = rst[1].restoration_unit_size; } -static void init_ref_frame_bufs(AV1_COMMON *cm) { +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; int i; BufferPool *const pool = cm->buffer_pool; cm->new_fb_idx = INVALID_IDX; @@ -3828,7 +3836,7 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) { } if (cm->seq_params.force_screen_content_tools) { for (i = 0; i < FRAME_BUFFERS; ++i) { - av1_hash_table_init(&pool->frame_bufs[i].hash_table); + av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb); } } } @@ -3846,7 +3854,7 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth, seq_params->use_highbitdepth = use_highbitdepth; alloc_raw_frame_buffers(cpi); - init_ref_frame_bufs(cm); + init_ref_frame_bufs(cpi); alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); // TODO(agrange) This can be removed. 
@@ -4220,7 +4228,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { if (lf->filter_level[0] || lf->filter_level[1]) { #if LOOP_FILTER_BITMASK - av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0); #else if (cpi->num_workers > 1) av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, @@ -4587,8 +4595,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - if (recode_loop_test_global_motion(cpi)) { - loop = 1; + if (!cpi->sf.gm_disable_recode) { + if (recode_loop_test_global_motion(cpi)) loop = 1; } if (loop) { @@ -4716,47 +4724,6 @@ static void set_ext_overrides(AV1_COMP *cpi) { cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME; } -static int setup_interp_filter_search_mask(AV1_COMP *cpi) { - InterpFilter ifilter; - int ref_total[REF_FRAMES] = { 0 }; - MV_REFERENCE_FRAME ref; - int mask = 0; - int arf_idx = ALTREF_FRAME; - - if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->refresh_alt2_ref_frame) - return mask; - - for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) - for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) - ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; - - for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) { - if ((ref_total[LAST_FRAME] && - cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && - (ref_total[LAST2_FRAME] == 0 || - cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 < - ref_total[LAST2_FRAME]) && - (ref_total[LAST3_FRAME] == 0 || - cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 < - ref_total[LAST3_FRAME]) && - (ref_total[GOLDEN_FRAME] == 0 || - cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < - ref_total[GOLDEN_FRAME]) && - (ref_total[BWDREF_FRAME] == 0 || - 
cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 < - ref_total[BWDREF_FRAME]) && - (ref_total[ALTREF2_FRAME] == 0 || - cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 < - ref_total[ALTREF2_FRAME]) && - (ref_total[ALTREF_FRAME] == 0 || - cpi->interp_filter_selected[arf_idx][ifilter] * 50 < - ref_total[ALTREF_FRAME])) - mask |= 1 << ifilter; - } - return mask; -} - #define DUMP_RECON_FRAMES 0 #if DUMP_RECON_FRAMES == 1 @@ -4914,7 +4881,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (cm->current_video_frame > 0) cpi->ref_frame_flags = get_ref_frame_flags(cpi); - if (cm->show_existing_frame) { + if (encode_show_existing_frame(cm)) { // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current // BWDREF_FRAME in the reference frame buffer. if (cm->frame_type == KEY_FRAME) { @@ -4925,20 +4892,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->show_frame = 1; cpi->frame_flags = *frame_flags; - // In the case of show_existing frame, we will not send fresh flag - // to decoder. Any change in the reference frame buffer can be done by - // switching the virtual indices. - - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_bwd_ref_frame = 0; - cpi->rc.is_last_bipred_frame = 0; - cpi->rc.is_bipred_frame = 0; - restore_coding_context(cpi); // Build the bitstream @@ -4990,10 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, av1_rc_postencode_update(cpi, *size); } - // Decrement count down till next gf - if (cpi->rc.frames_till_gf_update_due > 0) - cpi->rc.frames_till_gf_update_due--; - ++cm->current_video_frame; return AOM_CODEC_OK; @@ -5002,9 +4951,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, // Set default state for segment based loop filter update flags. 
cm->lf.mode_ref_delta_update = 0; - if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) - cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); - // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { // Reset the loop filter deltas and segmentation map. @@ -5246,15 +5192,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, // takes a space in the gf group. Therefore, even when // it is not shown, we still need update the count down. - // TODO(weitinglin): This is a work-around to handle the condition - // when a frame is drop. We should fix the cm->show_frame flag - // instead of checking the other condition to update the counter properly. - if (cm->show_frame || is_frame_droppable(cpi)) { - // Decrement count down till next gf - if (cpi->rc.frames_till_gf_update_due > 0) - cpi->rc.frames_till_gf_update_due--; - } - if (cm->show_frame) { // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that // are @@ -5279,6 +5216,50 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, return AOM_CODEC_OK; } +static INLINE void update_keyframe_counters(AV1_COMP *cpi) { + // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME + // differently here for rc->avg_frame_bandwidth. + if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) { + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.frame_type == KEY_FRAME) { + // If this is a show_existing_frame with a source other than altref, + // or if it is not a displayed forward keyframe, the keyframe update + // counters were incremented when it was originally encoded. 
+ cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + } + } +} + +static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { + // TODO(weitinglin): Updating this counter for is_frame_droppable + // is a work-around to handle the condition when a frame is drop. + // We should fix the cpi->common.show_frame flag + // instead of checking the other condition to update the counter properly. + if (cpi->common.show_frame || is_frame_droppable(cpi)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } +} + +static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) { + // Increment the gf group index ready for the next frame. If this is + // a show_existing_frame with a source other than altref, or if it is not + // a displayed forward keyframe, the index was incremented when it was + // originally encoded. + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.frame_type == KEY_FRAME) { + ++cpi->twopass.gf_group.index; + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi); +} + static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, int skip_adapt, unsigned int *frame_flags) { if (cpi->oxcf.rc_mode == AOM_CBR) { @@ -5290,6 +5271,7 @@ static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, AOM_CODEC_OK) { return AOM_CODEC_ERROR; } + update_rc_counts(cpi); check_show_existing_frame(cpi); return AOM_CODEC_OK; } @@ -5319,14 +5301,8 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->cum_txcoeff_cost_timer); #endif - // Do not do post-encoding update for those frames that do not have a spot - // in - // a gf group, but note that an OVERLAY frame always has a spot in a gf - // group, - // even when show_existing_frame is used. 
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) { - av1_twopass_postencode_update(cpi); - } + av1_twopass_postencode_update(cpi); + update_rc_counts(cpi); check_show_existing_frame(cpi); return AOM_CODEC_OK; } @@ -5734,7 +5710,7 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, av1_get_block_hash_value( cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, block_size, &hash_value_1, &hash_value_2, - (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH)); + (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb); // Hashing does not work for highbitdepth currently. // TODO(Roger): Make it work for highbitdepth. if (av1_use_hash_me(&cpi->common)) { @@ -5822,13 +5798,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0); - // Is multi-arf enabled. - // Note that at the moment multi_arf is only configured for 2 pass VBR - if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; - else - cpi->multi_arf_allowed = 0; - // Normal defaults cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode ? 
REFRESH_FRAME_CONTEXT_DISABLED @@ -5850,16 +5819,20 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *lookahead_src = NULL; if (cm->current_video_frame > 0) lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); - if (lookahead_src != NULL && - ((cpi->oxcf.error_resilient_mode | - ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) || - (cpi->oxcf.s_frame_mode | - ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) && - !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) { - cm->show_existing_frame = 0; + + int use_show_existing = 1; + if (lookahead_src != NULL) { + const int is_error_resilient = + cpi->oxcf.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = cpi->oxcf.s_frame_mode || + (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY); + use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame; } - if (oxcf->pass == 2 && cm->show_existing_frame) { + if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) { // Manage the source buffer and flush out the source frame that has been // coded already; Also get prepared for PSNR calculation if needed. 
if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) { @@ -6415,3 +6388,50 @@ int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) { const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; } + +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { + if (!cpi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + write_sequence_header_obu(cpi, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != + obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 2b7ab711d..ee7fc4637 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ 
b/third_party/aom/av1/encoder/encoder.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ENCODER_H_ -#define AV1_ENCODER_ENCODER_H_ +#ifndef AOM_AV1_ENCODER_ENCODER_H_ +#define AOM_AV1_ENCODER_ENCODER_H_ #include @@ -142,7 +142,6 @@ typedef struct AV1EncoderConfig { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: int speed; - int dev_sf; // maximum allowed bitrate for any intra frame in % of bitrate target. unsigned int rc_max_intra_bitrate_pct; // maximum allowed bitrate for any inter frame in % of bitrate target. @@ -249,6 +248,7 @@ typedef struct AV1EncoderConfig { int min_gf_interval; int max_gf_interval; + int row_mt; int tile_columns; int tile_rows; int tile_width_count; @@ -309,6 +309,9 @@ typedef struct AV1EncoderConfig { float noise_level; int noise_block_size; #endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { @@ -401,6 +404,43 @@ typedef struct FRAME_COUNTS { [SWITCHABLE_FILTERS]; } FRAME_COUNTS; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 + +typedef struct { + int ready; + double a; + double b; + double dist_mean; + double ld_mean; + double sse_mean; + double sse_sse_mean; + double sse_ld_mean; + int num; + double dist_sum; + double ld_sum; + double sse_sum; + double sse_sse_sum; + double sse_ld_sum; +} InterModeRdModel; + +typedef struct { + int idx; + int64_t rd; +} RdIdxPair; +// TODO(angiebird): This is an estimated size. We still need to figure what is +// the maximum number of modes. 
+#define MAX_INTER_MODES 1024 +typedef struct inter_modes_info { + int num; + MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; + int mode_rate_arr[MAX_INTER_MODES]; + int64_t sse_arr[MAX_INTER_MODES]; + int64_t est_rd_arr[MAX_INTER_MODES]; + RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; +} InterModesInfo; +#endif + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. typedef struct TileDataEnc { TileInfo tile_info; @@ -411,8 +451,18 @@ typedef struct TileDataEnc { CFL_CTX cfl; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); uint8_t allow_update_cdf; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + InterModesInfo inter_modes_info; +#endif } TileDataEnc; +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + typedef struct RD_COUNTS { int64_t comp_pred_diff[REFERENCE_MODES]; // Stores number of 4x4 blocks using global motion per reference frame. @@ -427,11 +477,14 @@ typedef struct ThreadData { FRAME_COUNTS *counts; PC_TREE *pc_tree; PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; + uint32_t *hash_value_buffer[2][2]; int32_t *wsrc_buf; int32_t *mask_buf; uint8_t *above_pred_buf; uint8_t *left_pred_buf; PALETTE_BUFFER *palette_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; int intrabc_used_this_tile; } ThreadData; @@ -502,6 +555,7 @@ typedef struct AV1_COMP { int previous_index; int cur_poc; // DebugInfo + unsigned int row_mt; int scaled_ref_idx[REF_FRAMES]; int ref_fb_idx[REF_FRAMES]; int refresh_fb_idx; // ref frame buffer index to refresh @@ -647,13 +701,12 @@ typedef struct AV1_COMP { search_site_config ss_cfg; - int multi_arf_allowed; - TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. 
TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS]; + TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; @@ -703,8 +756,13 @@ typedef struct AV1_COMP { #if CONFIG_DENOISE struct aom_denoise_and_model_t *denoise_and_model; #endif + // Stores the default value of skip flag depending on chroma format + // Set as 1 for monochrome and 3 for other color formats + int default_interp_skip_flags; + int preserve_arf_as_gld; } AV1_COMP; +// Must not be called more than once. void av1_initialize_enc(void); struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, @@ -833,6 +891,22 @@ static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2, return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } +static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok, int sb_size_log2, + int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2; + + *tok = cpi->tile_tok[tile_row][tile_col] + + get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes); +} + void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); #define ALT_MIN_LAG 3 @@ -885,8 +959,27 @@ static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { return !av1_superres_scaled(cm) && av1_resize_scaled(cm); } +// Don't allow a show_existing_frame to coincide with an error resilient +// frame. An exception can be made for a forward keyframe since it has no +// previous dependencies. 
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { + return cm->show_existing_frame && + (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME); +} + +// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon +// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this +// function, the memory must be freed by the caller. Both the buf member of the +// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory +// returned must be freed via call to free(). +// +// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_ENCODER_H_ +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 81f360733..5a31d93d7 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -133,6 +133,38 @@ static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, return error; } +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; + +static INLINE int get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; + } + + *extra = eob - k_eob_group_start[t]; + + return t; +} + #if CONFIG_ENTROPY_STATS void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, TX_CLASS tx_class, PLANE_TYPE plane, 
@@ -464,8 +496,12 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_writer *w, int blk_row, int blk_col, int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, uint16_t eob, TXB_CTX *txb_ctx) { - const PLANE_TYPE plane_type = get_plane_type(plane); const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, + ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); + if (eob == 0) return; + const PLANE_TYPE plane_type = get_plane_type(plane); const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); const TX_CLASS tx_class = tx_type_to_class[tx_type]; @@ -475,18 +511,10 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, const int bwl = get_txb_bwl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, width); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); - - aom_write_symbol(w, eob == 0, - ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2); - if (plane == 0 && eob == 0) { - assert(tx_type == DCT_DCT); - } - if (eob == 0) return; - av1_txb_init_levels(tcoeff, width, height, levels); av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w); diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 0442cc613..40ae343b0 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef ENCODETXB_H_ -#define ENCODETXB_H_ +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ #include "config/aom_config.h" @@ -84,4 +84,4 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, } #endif -#endif // COEFFS_CODING_H_ +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 637d6824c..e8ac30bb5 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -27,7 +27,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; const AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tile_cols; @@ -47,88 +48,141 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { return 1; } -void av1_encode_tiles_mt(AV1_COMP *cpi) { +static void create_enc_workers(AV1_COMP *cpi, int num_workers) { AV1_COMMON *const cm = &cpi->common; - const int tile_cols = cm->tile_cols; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); - int i; - av1_init_tile_data(cpi); + CHECK_MEM_ERROR(cm, cpi->workers, + aom_malloc(num_workers * sizeof(*cpi->workers))); - // Only run once to create threads and allocate thread data. 
- if (cpi->num_workers == 0) { - CHECK_MEM_ERROR(cm, cpi->workers, - aom_malloc(num_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + ++cpi->num_workers; + winterface->init(worker); + + thread_data->cpi = cpi; + + if (i < num_workers - 1) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->pc_tree = NULL; + av1_setup_pc_tree(cm, thread_data->td); + + CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); + CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + + CHECK_MEM_ERROR( + cm, thread_data->td->wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, thread_data->td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0]))); + + CHECK_MEM_ERROR( + cm, thread_data->td->mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); + // Allocate frame counters in thread data. 
+ CHECK_MEM_ERROR(cm, thread_data->td->counts, + aom_calloc(1, sizeof(*thread_data->td->counts))); + + // Allocate buffers used by palette coding mode. + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); + for (int j = 0; j < 2; ++j) { + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_obmc_bufs[j], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->tmp_obmc_bufs[j]))); + } - ++cpi->num_workers; - winterface->init(worker); + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->td = &cpi->td; + } + winterface->sync(worker); + } +} - thread_data->cpi = cpi; +static void launch_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + // Encode a frame + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - if (i < num_workers - 1) { - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - aom_memalign(32, sizeof(*thread_data->td))); - av1_zero(*thread_data->td); + // Set the starting tile for each thread. + thread_data->start = i; - // Set up pc_tree. 
- thread_data->td->pc_tree = NULL; - av1_setup_pc_tree(cm, thread_data->td); + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } +} - CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, - (uint8_t *)aom_memalign( - 16, MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->above_pred_buf))); - CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, - (uint8_t *)aom_memalign( - 16, MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->left_pred_buf))); +static void sync_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - CHECK_MEM_ERROR( - cm, thread_data->td->wsrc_buf, - (int32_t *)aom_memalign( - 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); - CHECK_MEM_ERROR( - cm, thread_data->td->mask_buf, - (int32_t *)aom_memalign( - 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); - // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, - aom_calloc(1, sizeof(*thread_data->td->counts))); - - // Allocate buffers used by palette coding mode. - CHECK_MEM_ERROR( - cm, thread_data->td->palette_buffer, - aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); - - // Create threads - if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, - "Tile encoder thread creation failed"); - } else { - // Main thread acts as a worker and uses the thread data in cpi. - thread_data->td = &cpi->td; - } + // Encoding ends. 
+ for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} - winterface->sync(worker); +static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) { + for (int i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile; + // Accumulate counters. + if (i < cpi->num_workers - 1) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; } - } else { - num_workers = AOMMIN(num_workers, cpi->num_workers); } +} - for (i = 0; i < num_workers; i++) { +static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + for (int i = 0; i < num_workers; i++) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; - worker->hook = (AVxWorkerHook)enc_worker_hook; + worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; @@ -139,47 +193,59 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } thread_data->td->mb.mask_buf = thread_data->td->mask_buf; } if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } - if (i < num_workers - 1) + if (i < num_workers - 1) 
{ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; - } - - // Encode a frame - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - - // Set the starting tile for each thread. - thread_data->start = i; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_obmc_bufs[j] = + thread_data->td->tmp_obmc_bufs[j]; + } - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_obmc_bufs[j]; + } + } } +} - // Encoding ends. - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); - } +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows); - for (i = 0; i < num_workers; i++) { - AVxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile; - // Accumulate counters. - if (i < cpi->num_workers - 1) { - av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); - accumulate_rd_opt(&cpi->td, thread_data->td); - cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; - } + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + // Only run once to create threads and allocate thread data. 
+ if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); } + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); } // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h index b6b1fed4e..5de4b4803 100644 --- a/third_party/aom/av1/encoder/ethread.h +++ b/third_party/aom/av1/encoder/ethread.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_ETHREAD_H_ -#define AV1_ENCODER_ETHREAD_H_ +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ #ifdef __cplusplus extern "C" { @@ -34,4 +34,4 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, } // extern "C" #endif -#endif // AV1_ENCODER_ETHREAD_H_ +#endif // AOM_AV1_ENCODER_ETHREAD_H_ diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h index 48178b964..e0432cc97 100644 --- a/third_party/aom/av1/encoder/extend.h +++ b/third_party/aom/av1/encoder/extend.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_EXTEND_H_ -#define AV1_ENCODER_EXTEND_H_ +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" @@ -29,4 +29,4 @@ void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // AV1_ENCODER_EXTEND_H_ +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index ef0800c79..69dd20c52 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -31,6 +31,7 @@ #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" +#include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" @@ -39,7 +40,7 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rd.h" -#include "av1/encoder/dwt.h" +#include "av1/encoder/reconinter_enc.h" #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 @@ -51,9 +52,10 @@ #define FACTOR_PT_LOW 0.70 #define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 +#define GF_MAX_BOOST 90.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 +#define KF_MIN_FRAME_BOOST 80.0 +#define KF_MAX_FRAME_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 #define MIN_KF_BOOST 300 @@ -62,6 +64,7 @@ #define DEFAULT_GRP_WEIGHT 1.0 #define RC_FACTOR_MIN 0.75 #define RC_FACTOR_MAX 1.75 +#define MIN_FWD_KF_INTERVAL 8 #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 @@ -1562,576 +1565,9 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -#if USE_GF16_MULTI_LAYER -// === GF Group of 16 === -#define GF_INTERVAL_16 16 -#define GF_FRAME_PARAMS (REF_FRAMES + 5) - -// GF Group of 16: multi-layer hierarchical coding structure -// 1st Layer: Frame 0 and Frame 16 (ALTREF) -// 2nd Layer: Frame 8 
(ALTREF2) -// 3rd Layer: Frame 4 and 12 (ALTREF2) -// 4th Layer: Frame 2, 6, 10, and 14 (BWDREF) -// 5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15 -static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = { - // gf_group->index: coding order - // (Frame #) : display order - { - // gf_group->index == 0 (Frame 0) - OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // References (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - GOLDEN_FRAME // cpi->refresh_golden_frame = 1 - }, - { - // gf_group->index == 1 (Frame 16) - ARF_UPDATE, // update_type - GF_INTERVAL_16 - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - GOLDEN_FRAME, // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - ALTREF_FRAME // cpi->refresh_alt_ref_frame = 1 - }, - { - // gf_group->index == 2 (Frame 8) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 1) - 1, // arf_src_offset - 0, // brf_src_offset - // 
Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 3 (Frame 4) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx - // (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx - // (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 4 (Frame 2) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx - // (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> 
cpi->alt2_fb_idx - // (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - REF_FRAMES, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_ref_frame = 1 - }, - { - // gf_group->index == 5 (Frame 1) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - REF_FRAMES, // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 6 (Frame 3) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - REF_FRAMES, // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->ext_fb_idx (extra ref frame) - // Refreshment 
(index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 7 (Frame 4 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 8 (Frame 6) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx -> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // 
cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 9 (Frame 5) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 10 (Frame 7) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 11 (Frame 8 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // 
cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 12 (Frame 12) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 13 (Frame 10) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] 
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF2_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 14 (Frame 9) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 15 (Frame 11) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // 
cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 16 (Frame 12 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1 - }, - { - // gf_group->index == 17 (Frame 14) - BRF_UPDATE, // update_type - 0, // arf_src_offset - 1, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra 
ref frame) - // Refreshment (index, flag) - BWDREF_FRAME, // Index (current) of reference to get updated - BWDREF_FRAME // cpi->refresh_bwd_frame = 1 - }, - { - // gf_group->index == 18 (Frame 13) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 19 (Frame 15) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // brf_src_offset - // Reference frame indexes (previous ===> current) - BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME - - // LAST_FRAME] - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - LAST3_FRAME, // Index (current) of reference to get updated - LAST_FRAME // cpi->refresh_last_frame = 1 - }, - { - // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF - // group) - OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 0, 
// brf_src_offset - // Reference frame indexes (previous ===> current) - LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] - LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] - LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===> - // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] - GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME) - BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME) - ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME) - ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME) - REF_FRAMES, // cpi->ext_fb_idx (extra ref frame) - // Refreshment (index, flag) - ALTREF_FRAME, // Index (current) of reference to get updated - GOLDEN_FRAME // cpi->refresh_golden_frame = 1 - } -}; - -// === GF Group of 16 === -static void define_gf_group_structure_16(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - const int key_frame = cpi->common.frame_type == KEY_FRAME; - - assert(rc->baseline_gf_interval == GF_INTERVAL_16); - - // Total number of frames to consider for GF group of 16: - // = GF group interval + number of OVERLAY's - // = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1 - // NOTE: The OVERLAY frame for the next GF group also needs to consider to - // prepare for the reference frame index mapping. 
- - const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2; - - for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) { - int param_idx = 0; - - // Treat KEY_FRAME differently - if (frame_index == 0 && key_frame) { - gf_group->update_type[frame_index] = KF_UPDATE; - - gf_group->rf_level[frame_index] = KF_STD; - gf_group->arf_src_offset[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - gf_group->bidir_pred_enabled[frame_index] = 0; - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) - gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx; - gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; - gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1]; - - continue; - } - - // == update_type == - gf_group->update_type[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == rf_level == - // Derive rf_level from update_type - switch (gf_group->update_type[frame_index]) { - case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break; - case OVERLAY_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break; - case LAST_BIPRED_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - case INTNL_ARF_UPDATE: - gf_group->rf_level[frame_index] = GF_ARF_LOW; - break; - case INTNL_OVERLAY_UPDATE: - gf_group->rf_level[frame_index] = INTER_NORMAL; - break; - default: gf_group->rf_level[frame_index] = INTER_NORMAL; break; - } - - // == arf_src_offset == - gf_group->arf_src_offset[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == brf_src_offset == - gf_group->brf_src_offset[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == bidir_pred_enabled == - // Derive 
bidir_pred_enabled from bidir_src_offset - gf_group->bidir_pred_enabled[frame_index] = - gf_group->brf_src_offset[frame_index] ? 1 : 0; - - // == ref_fb_idx_map == - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) - gf_group->ref_fb_idx_map[frame_index][ref_idx] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == refresh_idx == - gf_group->refresh_idx[frame_index] = - gf16_multi_layer_params[frame_index][param_idx++]; - - // == refresh_flag == - gf_group->refresh_flag[frame_index] = - gf16_multi_layer_params[frame_index][param_idx]; - } - - // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE / - // INTNL_OVERLAY_UPDATE for rate allocation - // NOTE: Indexes are designed in the display order backward: - // ALT[3] .. ALT[2] .. ALT[1] .. ALT[0], - // but their coding order is as follows: - // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0 - - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf; - - // == arf_pos_for_ovrly ==: Position for OVERLAY - for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - const int prior_num_arfs = - (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1); - cpi->arf_pos_for_ovrly[arf_idx] = - sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs; - } - - // == arf_pos_in_gf ==: Position for ALTREF - cpi->arf_pos_in_gf[0] = 1; - cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1; - cpi->arf_pos_in_gf[2] = 2; - cpi->arf_pos_in_gf[3] = 3; - - // == arf_update_idx == - // == arf_ref_idx == - // NOTE: Due to the hierarchical nature of GF16, these two parameters only - // relect the index to the nearest future overlay. 
- int start_frame_index = 0; - for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) { - const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx]; - for (int frame_index = start_frame_index; frame_index <= end_frame_index; - ++frame_index) { - gf_group->arf_update_idx[frame_index] = arf_idx; - gf_group->arf_ref_idx[frame_index] = arf_idx; - } - start_frame_index = end_frame_index + 1; - } -} -#endif // USE_GF16_MULTI_LAYER - #if USE_SYMM_MULTI_LAYER +// #define CHCEK_GF_PARAMETER +#ifdef CHCEK_GF_PARAMETER void check_frame_params(GF_GROUP *const gf_group, int gf_interval, int frame_nums) { static const char *update_type_strings[] = { @@ -2149,9 +1585,15 @@ void check_frame_params(GF_GROUP *const gf_group, int gf_interval, gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); } + + fprintf(fid, "number of nodes in each level: \n"); + for (int i = 0; i < MAX_PYRAMID_LVL; ++i) { + fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); + } + fprintf(fid, "\n"); fclose(fid); } - +#endif // CHCEK_GF_PARAMETER static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { // Derive rf_level from update_type switch (update_type) { @@ -2169,14 +1611,17 @@ static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, int *frame_ind, int arf_ind, int level) { - if (r - l == 2) { - // leaf node, not a look-ahead frame - gf_group->update_type[*frame_ind] = LF_UPDATE; - gf_group->arf_src_offset[*frame_ind] = 0; - gf_group->arf_pos_in_gf[*frame_ind] = 0; - gf_group->arf_update_idx[*frame_ind] = arf_ind; - gf_group->pyramid_level[*frame_ind] = level; - ++(*frame_ind); + if (r - l < 4) { + while (++l < r) { + // leaf nodes, not a look-ahead frame + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + 
gf_group->arf_update_idx[*frame_ind] = arf_ind; + gf_group->pyramid_level[*frame_ind] = 0; + ++gf_group->pyramid_lvl_nodes[0]; + ++(*frame_ind); + } } else { int m = (l + r) / 2; int arf_pos_in_gf = *frame_ind; @@ -2186,6 +1631,7 @@ static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, gf_group->arf_pos_in_gf[*frame_ind] = 0; gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1 gf_group->pyramid_level[*frame_ind] = level; + ++gf_group->pyramid_lvl_nodes[level]; ++(*frame_ind); // set parameters for frames displayed before this frame @@ -2209,7 +1655,7 @@ static INLINE unsigned char get_pyramid_height(int pyramid_width) { assert(pyramid_width <= 16 && pyramid_width >= 4 && "invalid gf interval for pyramid structure"); - return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2); + return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2); } static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, @@ -2217,6 +1663,10 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, int frame_index = 0; gf_group->pyramid_height = get_pyramid_height(gf_interval); + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL); + + av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL); + // At the beginning of each GF group it will be a key or overlay frame, gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->arf_src_offset[frame_index] = 0; @@ -2236,9 +1686,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, // set parameters for the rest of the frames set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0, gf_group->pyramid_height - 1); - - // check_frame_params(gf_group, gf_interval, frame_index); - return frame_index; } @@ -2248,8 +1695,8 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) { GF_GROUP *const gf_group = &twopass->gf_group; const int key_frame = cpi->common.frame_type == KEY_FRAME; - assert(rc->baseline_gf_interval == 4 || 
rc->baseline_gf_interval == 8 || - rc->baseline_gf_interval == 16); + assert(rc->baseline_gf_interval >= 4 && + rc->baseline_gf_interval <= MAX_PYRAMID_SIZE); const int gf_update_frames = construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval); @@ -2305,8 +1752,9 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) { // This parameter is useless? gf_group->arf_ref_idx[frame_index] = 0; - +#ifdef CHCEK_GF_PARAMETER check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames); +#endif } // It is an example of how to define a GF stucture manually. The function will @@ -2447,16 +1895,10 @@ static int define_gf_group_structure_4(AV1_COMP *cpi) { static void define_gf_group_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if USE_GF16_MULTI_LAYER - if (rc->baseline_gf_interval == 16) { - define_gf_group_structure_16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER #if USE_SYMM_MULTI_LAYER - const int valid_customized_gf_length = rc->baseline_gf_interval == 4 || - rc->baseline_gf_interval == 8 || - rc->baseline_gf_interval == 16; + const int valid_customized_gf_length = + rc->baseline_gf_interval >= 4 && + rc->baseline_gf_interval <= MAX_PYRAMID_SIZE; // used the new structure only if extra_arf is allowed if (valid_customized_gf_length && rc->source_alt_ref_pending && cpi->extra_arf_allowed > 0) { @@ -2685,6 +2127,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) { gf_group->brf_src_offset[frame_index] = 0; } +#if USE_SYMM_MULTI_LAYER +#define LEAF_REDUCTION_FACTOR 0.75f +#define LVL_3_BOOST_FACTOR 0.8f +#define LVL_2_BOOST_FACTOR 0.3f + +static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = { + { 1, 0, 0 }, + { LVL_3_BOOST_FACTOR, 0, 0 }, // Leaking budget works better + { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR, + (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) } +}; +#endif // USE_SYMM_MULTI_LAYER static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t 
gf_group_bits, double group_error, int gf_arf_bits) { RATE_CONTROL *const rc = &cpi->rc; @@ -2771,20 +2225,39 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, // BIPRED_UPDATE frames need to be further adjusted. gf_group->bit_allocation[frame_index] = target_frame_size; #if USE_SYMM_MULTI_LAYER - } else if (cpi->new_bwdref_update_rule == 1 && + } else if (cpi->new_bwdref_update_rule && gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && + gf_group->pyramid_height >= 0 && + "non-valid height for a pyramid structure"); + int arf_pos = gf_group->arf_pos_in_gf[frame_index]; gf_group->bit_allocation[frame_index] = 0; - // Tried boosting up the allocated bits on backward reference frame - // by (target_frame_size >> 2) as in the original setting. However it - // does not bring gains for pyramid structure with GF length = 16. gf_group->bit_allocation[arf_pos] = target_frame_size; -#endif +#if MULTI_LVL_BOOST_VBR_CQ + const int pyr_h = gf_group->pyramid_height - 2; + const int this_lvl = gf_group->pyramid_level[arf_pos]; + const int dist2top = gf_group->pyramid_height - 1 - this_lvl; + + const float_t budget = + LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0]; + const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] / + gf_group->pyramid_lvl_nodes[this_lvl]; + + gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost); +#endif // MULTI_LVL_BOOST_VBR_CQ +#endif // USE_SYMM_MULTI_LAYER } else { assert(gf_group->update_type[frame_index] == LF_UPDATE || gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); gf_group->bit_allocation[frame_index] = target_frame_size; +#if MULTI_LVL_BOOST_VBR_CQ + if (cpi->new_bwdref_update_rule) { + gf_group->bit_allocation[frame_index] -= + (int)(target_frame_size * LEAF_REDUCTION_FACTOR); + } +#endif // MULTI_LVL_BOOST_VBR_CQ } ++frame_index; @@ -2833,9 +2306,11 @@ static void define_gf_group(AV1_COMP *cpi, 
FIRSTPASS_STATS *this_frame) { int i; double boost_score = 0.0; -#if !FIX_GF_INTERVAL_LENGTH +#if !CONFIG_FIX_GF_LENGTH double old_boost_score = 0.0; double mv_ratio_accumulator_thresh; + int active_max_gf_interval; + int active_min_gf_interval; #endif double gf_group_err = 0.0; #if GROUP_ADAPTIVE_MAXQ @@ -2862,8 +2337,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int f_boost = 0; int b_boost = 0; int flash_detected; - int active_max_gf_interval; - int active_min_gf_interval; int64_t gf_group_bits; double gf_group_error_left; int gf_arf_bits; @@ -2898,11 +2371,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_skip_pct -= this_frame->intra_skip_pct; gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; } -#if !FIX_GF_INTERVAL_LENGTH +#if !CONFIG_FIX_GF_LENGTH // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; -#endif // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. { @@ -2915,23 +2387,19 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (active_min_gf_interval > rc->max_gf_interval) active_min_gf_interval = rc->max_gf_interval; - if (cpi->multi_arf_allowed) { + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. 
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); + + // We have: active_min_gf_interval <= rc->max_gf_interval + if (active_max_gf_interval < active_min_gf_interval) + active_max_gf_interval = active_min_gf_interval; + else if (active_max_gf_interval > rc->max_gf_interval) active_max_gf_interval = rc->max_gf_interval; - } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) - active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; - } } - +#endif // !CONFIG_FIX_GF_LENGTH double avg_sr_coded_error = 0; double avg_raw_err_stdev = 0; int non_zero_stdev_count = 0; @@ -2990,10 +2458,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); -#if FIX_GF_INTERVAL_LENGTH +#if CONFIG_FIX_GF_LENGTH if (i == (FIXED_GF_LENGTH + 1)) break; #else - // Skip breaking condition for FIX_GF_INTERVAL_LENGTH + // Skip breaking condition for CONFIG_FIX_GF_LENGTH // Break out conditions. if ( // Break at active_max_gf_interval unless almost totally static. 
@@ -3017,7 +2485,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } old_boost_score = boost_score; -#endif // FIX_GF_INTERVAL_LENGTH +#endif // CONFIG_FIX_GF_LENGTH *this_frame = next_frame; } twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); @@ -3030,44 +2498,116 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { assert(num_mbs > 0); if (i) avg_sr_coded_error /= i; + if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; + + // Disable extra altrefs and backward refs for "still" gf group: + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. + const int disable_bwd_extarf = + (zero_motion_accumulator > MIN_ZERO_MOTION && + avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && + avg_raw_err_stdev < MAX_RAW_ERR_VAR); + + if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; + +#define REDUCE_GF_LENGTH_THRESH 4 +#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 +#define REDUCE_GF_LENGTH_BY 1 + int alt_offset = 0; +#if REDUCE_LAST_GF_LENGTH + // TODO(weitinglin): The length reduction stretagy is tweaking using AOM_Q + // mode, and hurting the performance of VBR mode. We need to investigate how + // to adjust GF length for other modes. 
+ + int allow_gf_length_reduction = + cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0; + + // We are going to have an alt ref, but we don't have do adjustment for + // lossless mode + if (allow_alt_ref && allow_gf_length_reduction && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) && + !is_lossless_requested(&cpi->oxcf)) { + // adjust length of this gf group if one of the following condition met + // 1: only one overlay frame left and this gf is too long + // 2: next gf group is too short to have arf compared to the current gf + + // maximum length of next gf group + const int next_gf_len = rc->frames_to_key - i; + const int single_overlay_left = + next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; + // the next gf is probably going to have a ARF but it will be shorter than + // this gf + const int unbalanced_gf = + i > REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 >= rc->min_gf_interval; + + if (single_overlay_left || unbalanced_gf) { + // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but is does not work + // better in the current setting + const int roll_back = REDUCE_GF_LENGTH_BY; + alt_offset = -roll_back; + i -= roll_back; + } + } +#endif + // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { // Calculate the boost for alt ref. rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost); rc->source_alt_ref_pending = 1; + + // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF + cpi->preserve_arf_as_gld = 1; } else { rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); rc->source_alt_ref_pending = 0; + cpi->preserve_arf_as_gld = 0; } // Set the interval until the next gf. 
- if (cpi->oxcf.fwd_kf_enabled) { - // Ensure the gf group before the next keyframe will contain an altref - if ((rc->frames_to_key - i < rc->min_gf_interval) && - (rc->frames_to_key != i)) { - rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval, - rc->static_scene_max_gf_interval); - } else { + // If forward keyframes are enabled, ensure the final gf group obeys the + // MIN_FWD_KF_INTERVAL. + if (cpi->oxcf.fwd_kf_enabled && + ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) { + if (i == rc->frames_to_key) { rc->baseline_gf_interval = i; + // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL + } else if ((rc->frames_to_key - i < + AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && + (rc->frames_to_key != i)) { + // if possible, merge the last two gf groups + if (rc->frames_to_key <= MAX_PYRAMID_SIZE) { + rc->baseline_gf_interval = rc->frames_to_key; + // if merging the last two gf groups creates a group that is too long, + // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL + } else { + rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; + } + } else { + rc->baseline_gf_interval = + i - (is_key_frame || rc->source_alt_ref_pending); } } else { rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); } - if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; - // Disable extra altrefs and backward refs for "still" gf group: - // zero_motion_accumulator: minimum percentage of (0,0) motion; - // avg_sr_coded_error: average of the SSE per pixel of each frame; - // avg_raw_err_stdev: average of the standard deviation of (0,0) - // motion error per block of each frame. 
- const int disable_bwd_extarf = - (zero_motion_accumulator > MIN_ZERO_MOTION && - avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && - avg_raw_err_stdev < MAX_RAW_ERR_VAR); - - if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; +#if REDUCE_LAST_ALT_BOOST +#define LAST_ALR_BOOST_FACTOR 0.2f + rc->arf_boost_factor = 1.0; + if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) { + // Reduce the boost of altref in the last gf group + if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY || + rc->frames_to_key - i == 0) { + rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + } + } +#endif if (!cpi->extra_arf_allowed) { cpi->num_extra_arfs = 0; @@ -3439,6 +2979,11 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // how many bits to spend on it. decay_accumulator = 1.0; boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; @@ -3450,7 +2995,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if ((i <= rc->max_gf_interval) || ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { const double frame_boost = - calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, 0, kf_max_boost); // How fast is prediction quality decaying. 
if (!detect_flash(twopass, 0)) { @@ -3513,147 +3058,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->modified_error_left -= kf_group_err; } -#if USE_GF16_MULTI_LAYER -// === GF Group of 16 === -void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - - int ref_fb_idx_prev[REF_FRAMES]; - int ref_fb_idx_curr[REF_FRAMES]; - - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame]; - } - - // Update map index for each reference frame - for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { - int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx]; - ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME]; - } - - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame]; - } -} - -// Define the reference buffers that will be updated post encode. 
-static void configure_buffer_updates_16(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - - if (gf_group->update_type[gf_group->index] == KF_UPDATE) { - cpi->refresh_fb_idx = 0; - - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_alt_ref_frame = 1; - - return; - } - - // Update reference frame map indexes - av1_ref_frame_map_idx_updates(cpi, gf_group->index); - - // Update refresh index - switch (gf_group->refresh_idx[gf_group->index]) { - case LAST_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME]; - break; - - case LAST2_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME]; - break; - - case LAST3_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME]; - break; - - case GOLDEN_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; - break; - - case BWDREF_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1]; - break; - - case ALTREF2_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1]; - break; - - case ALTREF_FRAME: - cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; - break; - - case REF_FRAMES: - cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1]; - break; - - default: assert(0); break; - } - - // Update refresh flags - switch (gf_group->refresh_flag[gf_group->index]) { - case LAST_FRAME: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case GOLDEN_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case BWDREF_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 1; - 
cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case ALTREF2_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - - case ALTREF_FRAME: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - - default: assert(0); break; - } - - switch (gf_group->update_type[gf_group->index]) { - case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break; - - case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break; - - case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break; - - case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1; - case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break; - - default: break; - } -} -#endif // USE_GF16_MULTI_LAYER - // Define the reference buffers that will be updated post encode. static void configure_buffer_updates(AV1_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; @@ -3667,14 +3071,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) { cpi->rc.is_bipred_frame = 0; cpi->rc.is_src_frame_ext_arf = 0; -#if USE_GF16_MULTI_LAYER - RATE_CONTROL *const rc = &cpi->rc; - if (rc->baseline_gf_interval == 16) { - configure_buffer_updates_16(cpi); - return; - } -#endif // USE_GF16_MULTI_LAYER - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { case KF_UPDATE: cpi->refresh_last_frame = 1; @@ -3979,8 +3375,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) { : cpi->common.MBs; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. 
- twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); + twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0); twopass->frame_avg_haar_energy = log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); } @@ -4020,9 +3415,6 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { } twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); - // Increment the gf group index ready for the next frame. - ++twopass->gf_group.index; - // If the rate control is drifting consider adjustment to min or maxq. if ((cpi->oxcf.rc_mode != AOM_Q) && (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index b0c1a21e4..4b7325ae2 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_FIRSTPASS_H_ -#define AV1_ENCODER_FIRSTPASS_H_ +#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ +#define AOM_AV1_ENCODER_FIRSTPASS_H_ #include "av1/common/enums.h" #include "av1/common/onyxc_int.h" @@ -47,15 +47,7 @@ typedef struct { // number of bi-predictive frames. #define BFG_INTERVAL 2 // The maximum number of extra ALTREF's except ALTREF_FRAME -// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered -// to serve as references. Currently REF_FRAMES == 8. 
-#define USE_GF16_MULTI_LAYER 0 - -#if USE_GF16_MULTI_LAYER -#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME) -#else // !USE_GF16_MULTI_LAYER #define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1) -#endif // USE_GF16_MULTI_LAYER #define MIN_EXT_ARF_INTERVAL 4 @@ -126,6 +118,7 @@ typedef struct { unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char pyramid_height; + unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL]; #endif unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; @@ -197,10 +190,6 @@ void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi, // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -#if USE_GF16_MULTI_LAYER -void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index); -#endif // USE_GF16_MULTI_LAYER - static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { if (arf_pending && MAX_EXT_ARFS > 0) return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) @@ -216,4 +205,4 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { } // extern "C" #endif -#endif // AV1_ENCODER_FIRSTPASS_H_ +#endif // AOM_AV1_ENCODER_FIRSTPASS_H_ diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index f07d1bc00..e9f8b0bb4 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -32,8 +32,8 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -static const double erroradv_tr[] = { 0.75, 0.70, 0.65 }; -static const double erroradv_prod_tr[] = { 22000, 20000, 18000 }; +static const double erroradv_tr[] = { 0.65, 0.60, 0.55 }; +static const double erroradv_prod_tr[] = { 20000, 18000, 16000 }; int is_enough_erroradvantage(double best_erroradvantage, int params_cost, int 
erroradv_type) { diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 2c15753fd..c7c016c43 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_GLOBAL_MOTION_H_ -#define AV1_ENCODER_GLOBAL_MOTION_H_ +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_ #include "aom/aom_integer.h" #include "aom_scale/yv12config.h" @@ -61,4 +61,4 @@ int compute_global_motion_feature_based(TransformationType type, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_GLOBAL_MOTION_H_ +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_ diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h index 45632da9b..945dc3733 100644 --- a/third_party/aom/av1/encoder/grain_test_vectors.h +++ b/third_party/aom/av1/encoder/grain_test_vectors.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_GRAIN_TEST_VECTORS_H_ -#define AV1_GRAIN_TEST_VECTORS_H_ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ /* Test vectors for emulation of different film grain types. * Note that bit depth would be derived from the bitstream and @@ -778,4 +778,4 @@ static aom_film_grain_t film_grain_test_vectors[16] = { 45231 /* random_seed */ }, }; -#endif // AV1_GRAIN_TEST_VECTORS_H_ +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h index 8b6227540..826c004d6 100644 --- a/third_party/aom/av1/encoder/hash.h +++ b/third_party/aom/av1/encoder/hash.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_HASH_H_ -#define AV1_ENCODER_HASH_H_ +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ #include "config/aom_config.h" @@ -43,8 +43,10 @@ typedef struct _CRC32C { // init table for software version crc32c void av1_crc32c_calculator_init(CRC32C *p_crc32c); +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_HASH_H_ +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c index f2ff5b495..e85a516e8 100644 --- a/third_party/aom/av1/encoder/hash_motion.c +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -13,14 +13,12 @@ #include "config/av1_rtcd.h" +#include "av1/encoder/block.h" #include "av1/encoder/hash.h" #include "av1/encoder/hash_motion.h" static const int crc_bits = 16; static const int block_size_bits = 3; -static CRC_CALCULATOR crc_calculator1; -static CRC_CALCULATOR crc_calculator2; -static int g_crc_initialized = 0; static void hash_table_clear_all(hash_table *p_hash_table) { if (p_hash_table->p_lookup_table == NULL) { @@ -106,11 +104,11 @@ static int hash_block_size_to_index(int block_size) { } } -void av1_hash_table_init(hash_table *p_hash_table) { - if (g_crc_initialized == 0) { - av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB); - av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB); - g_crc_initialized = 1; +void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) { + if (x->g_crc_initialized == 0) { + av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB); + av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB); + x->g_crc_initialized = 1; } p_hash_table->p_lookup_table = NULL; } @@ -181,7 +179,8 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], - int8_t *pic_block_same_info[3]) { + int8_t *pic_block_same_info[3], + 
MACROBLOCK *x) { const int width = 2; const int height = 2; const int x_end = picture->y_crop_width - width + 1; @@ -201,9 +200,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); pic_block_hash[0][pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); + &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0])); pic_block_hash[1][pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); + &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0])); pos++; } pos += width - 1; @@ -220,9 +219,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0])); + av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0])); pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0])); + av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0])); pos++; } pos += width - 1; @@ -235,7 +234,8 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], - int8_t *dst_pic_block_same_info[3]) { + int8_t *dst_pic_block_same_info[3], + MACROBLOCK *x) { const int pic_width = picture->y_crop_width; const int x_end = picture->y_crop_width - block_size + 1; const int y_end = picture->y_crop_height - block_size + 1; @@ -254,14 +254,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; dst_pic_block_hash[0][pos] = - av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length); + av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length); p[0] = src_pic_block_hash[1][pos]; p[1] = 
src_pic_block_hash[1][pos + src_size]; p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; dst_pic_block_hash[1][pos] = - av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length); + av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length); dst_pic_block_same_info[0][pos] = src_pic_block_same_info[0][pos] && @@ -388,17 +388,9 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, return 1; } -// global buffer for hash value calculation of a block -// used only in av1_get_block_hash_value() -#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) -// [first hash/second hash] -// [two buffers used ping-pong] -// [num of 2x2 blocks in 128x128] -static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH]; - void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, - int use_highbitdepth) { + int use_highbitdepth, MACROBLOCK *x) { uint32_t to_hash[4]; const int add_value = hash_block_size_to_index(block_size) << crc_bits; assert(add_value >= 0); @@ -415,10 +407,12 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, get_pixels_in_1D_short_array_by_block_2x2( y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[0][0][pos] = + av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + x->hash_value_buffer[1][0][pos] = + av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); } } } else { @@ -429,10 +423,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, 
get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - hash_value_buffer[0][0][pos] = av1_get_crc_value( - &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); - hash_value_buffer[1][0][pos] = av1_get_crc_value( - &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[0][0][pos] = av1_get_crc_value( + &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash)); + x->hash_value_buffer[1][0][pos] = av1_get_crc_value( + &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash)); } } } @@ -457,24 +451,24 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, assert(srcPos + src_sub_block_in_width + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); - to_hash[0] = hash_value_buffer[0][src_idx][srcPos]; - to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1]; + to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos]; + to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1]; to_hash[2] = - hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width]; - to_hash[3] = - hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1]; + x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = x->hash_value_buffer[0][src_idx] + [srcPos + src_sub_block_in_width + 1]; - hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value( - &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash)); + x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value( + &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash)); - to_hash[0] = hash_value_buffer[1][src_idx][srcPos]; - to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1]; + to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos]; + to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1]; to_hash[2] = - hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width]; - to_hash[3] = - hash_value_buffer[1][src_idx][srcPos + 
src_sub_block_in_width + 1]; - hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value( - &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash)); + x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = x->hash_value_buffer[1][src_idx] + [srcPos + src_sub_block_in_width + 1]; + x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value( + &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash)); dst_pos++; } } @@ -483,8 +477,6 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, sub_block_in_width >>= 1; } - *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value; - *hash_value2 = hash_value_buffer[1][dst_idx][0]; + *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value; + *hash_value2 = x->hash_value_buffer[1][dst_idx][0]; } - -#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h index 8deb92eb6..df3ec3215 100644 --- a/third_party/aom/av1/encoder/hash_motion.h +++ b/third_party/aom/av1/encoder/hash_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_HASH_MOTION_H_ -#define AV1_ENCODER_HASH_MOTION_H_ +#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ +#define AOM_AV1_ENCODER_HASH_MOTION_H_ #include "config/aom_config.h" @@ -34,7 +34,7 @@ typedef struct _hash_table { Vector **p_lookup_table; } hash_table; -void av1_hash_table_init(hash_table *p_hash_table); +void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x); void av1_hash_table_destroy(hash_table *p_hash_table); void av1_hash_table_create(hash_table *p_hash_table); int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value); @@ -44,13 +44,15 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, uint32_t hash_value2); void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], - int8_t *pic_block_same_info[3]); + int8_t *pic_block_same_info[3], + struct macroblock *x); void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture, int block_size, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], - int8_t *dst_pic_block_same_info[3]); + int8_t *dst_pic_block_same_info[3], + struct macroblock *x); void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, uint32_t *pic_hash[2], int8_t *pic_is_same, @@ -67,10 +69,10 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, - int use_highbitdepth); + int use_highbitdepth, struct macroblock *x); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_HASH_MOTION_H_ +#endif // AOM_AV1_ENCODER_HASH_MOTION_H_ diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index 0922557d0..67898fd18 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ 
b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -121,15 +121,45 @@ static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; - av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, - txfm_param->bd); + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + switch (tx_type) { + // use the c version for anything including identity for now + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + default: + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + } } static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; - av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, - txfm_param->bd); + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + switch (tx_type) { + // use the c version for anything including identity for now + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + default: + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + } } static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h index 6155b255a..daabc7119 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_ -#define AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ #include "config/aom_config.h" @@ -28,4 +28,4 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, } // extern "C" #endif -#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h index 3897c2a6a..e55224cf7 100644 --- a/third_party/aom/av1/encoder/lookahead.h +++ b/third_party/aom/av1/encoder/lookahead.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_LOOKAHEAD_H_ -#define AV1_ENCODER_LOOKAHEAD_H_ +#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_ +#define AOM_AV1_ENCODER_LOOKAHEAD_H_ #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" @@ -103,4 +103,4 @@ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // AV1_ENCODER_LOOKAHEAD_H_ +#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_ diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h index 23243dd9e..64f936176 100644 --- a/third_party/aom/av1/encoder/mathutils.h +++ b/third_party/aom/av1/encoder/mathutils.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#ifndef AOM_AV1_ENCODER_MATHUTILS_H_ +#define AOM_AV1_ENCODER_MATHUTILS_H_ + #include #include #include @@ -23,7 +26,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) { double c; // Forward elimination for (k = 0; k < n - 1; k++) { - // Bring the largest magitude to the diagonal position + // Bring the largest magnitude to the diagonal position for (i = n - 1; i > k; i--) { if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) { for (j = 0; j < n; j++) { @@ -352,3 +355,5 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M, return 0; } + +#endif // AOM_AV1_ENCODER_MATHUTILS_H_ diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c index 472173634..1a35ff77c 100644 --- a/third_party/aom/av1/encoder/mbgraph.c +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -17,11 +17,12 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/system_state.h" -#include "av1/encoder/segmentation.h" -#include "av1/encoder/mcomp.h" #include "av1/common/blockd.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/segmentation.h" static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, int mb_row, int mb_col) { @@ -140,7 +141,7 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) { // calculate SATD for each intra prediction mode; // we're intentionally not doing 4x4, we just want a rough estimate - for (mode = DC_PRED; mode <= PAETH_PRED; mode++) { + for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) { unsigned int err; xd->mi[0]->mode = mode; diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h index 3e0a4fa9b..ba08476f7 100644 --- a/third_party/aom/av1/encoder/mbgraph.h +++ b/third_party/aom/av1/encoder/mbgraph.h @@ -9,8 +9,8 @@ * PATENTS file, 
you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_MBGRAPH_H_ -#define AV1_ENCODER_MBGRAPH_H_ +#ifndef AOM_AV1_ENCODER_MBGRAPH_H_ +#define AOM_AV1_ENCODER_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -38,4 +38,4 @@ void av1_update_mbgraph_stats(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_MBGRAPH_H_ +#endif // AOM_AV1_ENCODER_MBGRAPH_H_ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index c4572a341..8f6de9b53 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -29,6 +29,7 @@ #include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" // #define NEW_DIAMOND_SEARCH @@ -219,7 +220,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { thismse = upsampled_pref_error( \ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \ pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \ - mask_stride, invert_mask, w, h, &sse); \ + mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ v += thismse; \ if (v < besterr) { \ @@ -342,19 +343,19 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); if (mask) { - aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset, + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, mask, mask_stride, invert_mask); } else { if (xd->jcp_param.use_jnt_comp_avg) - aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h, - y + offset, y_stride, &xd->jcp_param); + aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, + y_stride, &xd->jcp_param); else - 
aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); } - besterr = - vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); if (mask) { @@ -648,51 +649,54 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm, int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, - unsigned int *sse) { + unsigned int *sse, int subpel_search) { unsigned int besterr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); if (second_pred != NULL) { if (mask) { aom_highbd_comp_mask_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd); + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd, + subpel_search); } else { if (xd->jcp_param.use_jnt_comp_avg) aom_highbd_jnt_comp_avg_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param); + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search); else - aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, - second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, xd->bd); + aom_highbd_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, xd->bd, subpel_search); } } else { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); + 
aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, + subpel_search); } - - besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); + besterr = vfp->vf(pred8, w, src, src_stride, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); if (second_pred != NULL) { if (mask) { - aom_comp_mask_upsampled_pred( - xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask); + aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, + second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, + mask_stride, invert_mask, subpel_search); } else { if (xd->jcp_param.use_jnt_comp_avg) aom_jnt_comp_avg_upsampled_pred( xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride, &xd->jcp_param); + subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search); else aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); } } else { aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); } besterr = vfp->vf(pred, w, src, src_stride, sse); @@ -707,10 +711,11 @@ static unsigned int upsampled_setup_center_error( const int src_stride, const uint8_t *const y, int y_stride, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2], - unsigned int *sse1, int *distortion) { - unsigned int besterr = upsampled_pref_error( - xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset, - y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1); + unsigned int *sse1, int *distortion, int subpel_search) { + unsigned int besterr = + upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, + y 
+ offset, y_stride, 0, 0, second_pred, mask, + mask_stride, invert_mask, w, h, sse1, subpel_search); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; @@ -781,7 +786,8 @@ int av1_find_best_sub_pixel_tree( besterr = upsampled_setup_center_error( xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w, - h, offset, mvjcost, mvcost, sse1, distortion); + h, offset, mvjcost, mvcost, sse1, distortion, + use_accurate_subpel_search); else besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, y_stride, @@ -802,7 +808,8 @@ int av1_find_best_sub_pixel_tree( thismse = upsampled_pref_error( xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, - mask, mask_stride, invert_mask, w, h, &sse); + mask, mask_stride, invert_mask, w, h, &sse, + use_accurate_subpel_search); } else { thismse = estimate_upsampled_pref_error( xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), @@ -837,7 +844,8 @@ int av1_find_best_sub_pixel_tree( thismse = upsampled_pref_error( xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred, - mask, mask_stride, invert_mask, w, h, &sse); + mask, mask_stride, invert_mask, w, h, &sse, + use_accurate_subpel_search); } else { thismse = estimate_upsampled_pref_error( xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc), @@ -929,8 +937,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, int16_t bc = mbmi->mv[0].as_mv.col; int16_t *tr = &mbmi->mv[0].as_mv.row; int16_t *tc = &mbmi->mv[0].as_mv.col; - WarpedMotionParams best_wm_params = mbmi->wm_params[0]; - int best_num_proj_ref = mbmi->num_proj_ref[0]; + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = 
mbmi->num_proj_ref; unsigned int bestmse; int minc, maxc, minr, maxr; const int start = cm->allow_high_precision_mv ? 0 : 4; @@ -962,18 +970,18 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); if (total_samples > 1) - mbmi->num_proj_ref[0] = + mbmi->num_proj_ref = selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); - if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, - *tc, &mbmi->wm_params[0], mi_row, mi_col)) { + if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr, + *tc, &mbmi->wm_params, mi_row, mi_col)) { thismse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv); if (thismse < bestmse) { best_idx = idx; - best_wm_params = mbmi->wm_params[0]; - best_num_proj_ref = mbmi->num_proj_ref[0]; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; bestmse = thismse; } } @@ -990,8 +998,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, *tr = br; *tc = bc; - mbmi->wm_params[0] = best_wm_params; - mbmi->num_proj_ref[0] = best_num_proj_ref; + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; return bestmse; } @@ -2013,8 +2021,16 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, const uint8_t *mask, int mask_stride, int invert_mask, const MV *center_mv, const uint8_t *second_pred) { - const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, - { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; + static const search_neighbors neighbors[8] = { + { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { -1, 1 }, 
-1 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } + }; const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; @@ -2022,6 +2038,10 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, MV *best_mv = &x->best_mv.as_mv; unsigned int best_sad = INT_MAX; int i, j; + uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] = + { 0 }; + int grid_center = SEARCH_GRID_CENTER_8P; + int grid_coord = grid_center; clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); @@ -2043,13 +2063,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); } + do_refine_search_grid[grid_coord] = 1; + for (i = 0; i < search_range; ++i) { int best_site = -1; for (j = 0; j < 8; ++j) { - const MV mv = { best_mv->row + neighbors[j].row, - best_mv->col + neighbors[j].col }; + grid_coord = grid_center + neighbors[j].coord_offset; + if (do_refine_search_grid[grid_coord] == 1) { + continue; + } + const MV mv = { best_mv->row + neighbors[j].coord.row, + best_mv->col + neighbors[j].coord.col }; + do_refine_search_grid[grid_coord] = 1; if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad; if (mask) { @@ -2079,8 +2106,9 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, if (best_site == -1) { break; } else { - best_mv->row += neighbors[best_site].row; - best_mv->col += neighbors[best_site].col; + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; } } return best_sad; @@ -2099,11 +2127,11 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { } int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int 
step_param, int error_per_bit, + MV *mvp_full, int step_param, int method, + int run_mesh_search, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, int x_pos, int y_pos, int intra) { const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; @@ -2168,11 +2196,35 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, default: assert(0 && "Invalid search method."); } + // Should we allow a follow on exhaustive search? + if (!run_mesh_search) { + if (method == NSTEP) { + if (is_exhaustive_allowed(cpi, x)) { + int exhuastive_thr = sf->exhaustive_searches_thresh; + exhuastive_thr >>= + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + // Threshold variance for an exhaustive full search. + if (var > exhuastive_thr) run_mesh_search = 1; + } + } + } + + if (run_mesh_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit, + cost_list, fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + x->best_mv.as_mv = tmp_mv_ex; + } + } + if (method != NSTEP && rd && var < var_max) var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1); do { - if (!av1_use_hash_me(&cpi->common)) break; + if (!intra || !av1_use_hash_me(&cpi->common)) break; // already single ME // get block size and original buffer of current block @@ -2195,7 +2247,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, av1_get_block_hash_value( what, what_stride, block_width, &hash_value1, &hash_value2, - x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); // for intra, at lest one matching can be found, itself. 
@@ -2279,7 +2331,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV this_mv = { r, c }; \ thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \ mask, vfp, z, pre(y, y_stride, r, c), \ - y_stride, sp(c), sp(r), w, h, &sse); \ + y_stride, sp(c), sp(r), w, h, &sse, \ + use_accurate_subpel_search); \ if ((v = MVC(r, c) + thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -2307,18 +2360,20 @@ static int upsampled_obmc_pref_error( MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, const uint8_t *const y, int y_stride, - int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) { + int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse, + int subpel_search) { unsigned int besterr; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h, - subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd); - besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); + DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, + subpel_search); + besterr = vfp->ovf(pred8, w, wsrc, mask, sse); } else { - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3, - subpel_y_q3, y, y_stride); + subpel_y_q3, y, y_stride, subpel_search); besterr = vfp->ovf(pred, w, wsrc, mask, sse); } @@ -2330,10 +2385,11 @@ static unsigned int upsampled_setup_obmc_center_error( const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, const 
uint8_t *const y, int y_stride, int w, int h, int offset, - int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { - unsigned int besterr = - upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, - y + offset, y_stride, 0, 0, w, h, sse1); + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion, + int subpel_search) { + unsigned int besterr = upsampled_obmc_pref_error( + xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0, + 0, w, h, sse1, subpel_search); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; @@ -2388,11 +2444,12 @@ int av1_find_best_obmc_sub_pixel_tree_up( bestmv->row *= 8; bestmv->col *= 8; - // use_accurate_subpel_search can be 0 or 1 + // use_accurate_subpel_search can be 0 or 1 or 2 if (use_accurate_subpel_search) besterr = upsampled_setup_obmc_center_error( xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, - y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion); + y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion, + use_accurate_subpel_search); else besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, offset, mvjcost, mvcost, @@ -2408,7 +2465,8 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (use_accurate_subpel_search) { thismse = upsampled_obmc_pref_error( xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); + pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse, + use_accurate_subpel_search); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2439,7 +2497,8 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (use_accurate_subpel_search) { thismse = upsampled_obmc_pref_error( xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address, - pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse); + pre(y, 
y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse, + use_accurate_subpel_search); } else { thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), src_address, mask, &sse); @@ -2643,11 +2702,12 @@ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg, return best_sad; } -int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second) { +static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, + int is_second) { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; MV temp_mv; @@ -2704,6 +2764,31 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, return bestsme; } +int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, + int step_param, int sadpb, int further_steps, + int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second) { + if (cpi->sf.obmc_full_pixel_search_level == 0) { + return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb, + further_steps, do_refine, fn_ptr, ref_mv, + dst_mv, is_second); + } else { + const int32_t *wsrc = x->wsrc_buf; + const int32_t *mask = x->mask_buf; + const int search_range = 8; + *dst_mv = *mvp_full; + clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + int thissme = obmc_refining_search_sad( + x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1, + is_second); + return thissme; + } +} + // Note(yunqingwang): The following 2 functions are only used in the motion // vector 
unit test, which return extreme motion vectors allowed by the MV // limits. diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 539e8f4e4..a975218b0 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_MCOMP_H_ -#define AV1_ENCODER_MCOMP_H_ +#ifndef AOM_AV1_ENCODER_MCOMP_H_ +#define AOM_AV1_ENCODER_MCOMP_H_ #include "av1/encoder/block.h" #include "aom_dsp/variance.h" @@ -31,6 +31,11 @@ extern "C" { // for Block_16x16 #define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND) +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + // motion search site typedef struct search_site { MV mv; @@ -43,6 +48,11 @@ typedef struct search_site_config { int searches_per_step; } search_site_config; +typedef struct { + MV coord; + int coord_offset; +} search_neighbors; + void av1_init_dsmotion_compensation(search_site_config *cfg, int stride); void av1_init3smotion_compensation(search_site_config *cfg, int stride); @@ -120,14 +130,15 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, - int error_per_bit, int *cost_list, const MV *ref_mv, - int var_max, int rd, int x_pos, int y_pos, int intra); - -int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second); + int method, int run_mesh_search, int error_per_bit, + int *cost_list, const MV *ref_mv, int var_max, int rd, + int x_pos, int y_pos, int intra); + +int av1_obmc_full_pixel_search(const struct 
AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second); int av1_find_best_obmc_sub_pixel_tree_up( MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, @@ -147,4 +158,4 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, } // extern "C" #endif -#endif // AV1_ENCODER_MCOMP_H_ +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c index 3a27e5845..d21def43a 100644 --- a/third_party/aom/av1/encoder/ml.c +++ b/third_party/aom/av1/encoder/ml.c @@ -10,7 +10,9 @@ */ #include +#include +#include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/ml.h" void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, @@ -55,3 +57,17 @@ void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, weights += num_input_nodes; } } + +void av1_nn_softmax(const float *input, float *output, int n) { + // Softmax function is invariant to adding the same constant + // to all input values, so we subtract the maximum input to avoid + // possible overflow. + float max_inp = input[0]; + for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < n; i++) { + output[i] = (float)exp(input[i] - max_inp); + sum_out += output[i]; + } + for (int i = 0; i < n; i++) output[i] /= sum_out; +} diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h index 614cb60bb..cb8ef2871 100644 --- a/third_party/aom/av1/encoder/ml.h +++ b/third_party/aom/av1/encoder/ml.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_ML_H_ -#define AV1_ENCODER_ML_H_ +#ifndef AOM_AV1_ENCODER_ML_H_ +#define AOM_AV1_ENCODER_ML_H_ #ifdef __cplusplus extern "C" { @@ -37,8 +37,13 @@ typedef struct { void av1_nn_predict(const float *features, const NN_CONFIG *nn_config, float *output); +// Applies the softmax normalization function to the input +// to get a valid probability distribution in the output: +// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k])) +void av1_nn_softmax(const float *input, float *output, int n); + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_RD_H_ +#endif // AOM_AV1_ENCODER_ML_H_ diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h index bbdd50784..8b88c4755 100644 --- a/third_party/aom/av1/encoder/palette.h +++ b/third_party/aom/av1/encoder/palette.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_PALETTE_H_ -#define AV1_ENCODER_PALETTE_H_ +#ifndef AOM_AV1_ENCODER_PALETTE_H_ +#define AOM_AV1_ENCODER_PALETTE_H_ #include "av1/common/blockd.h" @@ -93,4 +93,4 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, } // extern "C" #endif -#endif /* AV1_ENCODER_PALETTE_H_ */ +#endif // AOM_AV1_ENCODER_PALETTE_H_ diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h index 279d39495..437ea43f9 100644 --- a/third_party/aom/av1/encoder/partition_model_weights.h +++ b/third_party/aom/av1/encoder/partition_model_weights.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ -#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ +#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { @@ -1314,204 +1314,112 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = { #define FEATURE_SIZE 18 #define LABEL_SIZE 4 -static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = { - 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f, - 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f, - 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f, - 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f, - -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f, - -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f, - -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f, - 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f, - -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f, - 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f, - -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f, - -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f, - -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f, - -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f, - -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f, - 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f, - -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f, - 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f, - 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f, - 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f, - -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f, - 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, 
-0.052993f, - 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f, - -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f, - -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f, - -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f, - 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f, - -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f, - -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f, - -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f, - -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f, - -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f, - 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f, - 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f, - 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f, - 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f, - -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f, - -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f, - 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f, - -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f, - 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f, - 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f, - 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f, - 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f, - 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f, - 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f, - -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f, - -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f, - -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f, - -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 
0.011347f, - 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f, - -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f, - 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f, - -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f, - -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f, - -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f, - -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f, - -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f, - 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f, - 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f, - 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f, - 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f, - -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f, - -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f, - -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f, - 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f, - -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f, - -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f, - -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f, - 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f, - 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f, - 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f, - 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f, - -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f, - 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f, - 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f, - -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f, - -0.032010f, 0.110627f, 0.054094f, -0.884309f, 
-1.171623f, -0.386911f, - -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f, - -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f, - 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f, - -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f, - 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f, - 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f, - -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f, - 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f, - 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f, - -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f, - -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f, - 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f, - 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f, - -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f, - -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f, - 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f, - -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f, - 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f, - -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f, - 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f, - -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f, - -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f, - 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f, - 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f, - 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f, - -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f, - -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f, - -0.238839f, -0.071736f, 0.601437f, -0.664072f, 
0.230827f, 0.198753f, - -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f, - -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f, - 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f, - -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f, - -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f, - 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f, - -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f, - -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f, - -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f, - -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f, - 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f, - 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f, - -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f, - 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f, - -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f, - 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f, - -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f, - 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f, - 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f, - -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f, - -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f, - 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f, - -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f, - 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f, - 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f, - -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f, - -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f, - -0.396278f, -0.389503f, -0.190410f, 
-0.014767f, -0.265229f, -0.099787f, - 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f, - -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f, - -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f, - -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f, - 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f, - -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f, - -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f, - 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f, - -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f, - 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f, -}; - -static const float av1_4_partition_nn_bias_16_layer0[48] = { - -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f, - -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f, - -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f, - -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f, - -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f, - 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f, - 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f, - -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f, -}; - -static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = { - 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f, - 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f, - -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f, - -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f, - -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f, - -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f, - -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f, - 0.185371f, 
0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f, - -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f, - -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f, - 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f, - -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f, - -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f, - 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f, - 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f, - 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f, - 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f, - 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f, - -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f, - 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f, - -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f, - -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f, - 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f, - -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f, - -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f, - 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f, - -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f, - 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f, - 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f, - 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f, - 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f, - -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f, +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 
0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 
0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, 
-0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f, + 0.489497f, 
-0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, }; static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { - -0.197883f, - -0.136696f, - 0.094115f, - 0.612799f, + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, }; static const NN_CONFIG av1_4_partition_nnconfig_16 = { @@ -1519,7 +1427,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = { LABEL_SIZE, // num_outputs 1, // num_hidden_layers { - 48, // num_hidden_nodes + 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_16_layer0, @@ -1532,143 +1440,143 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = { }; static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { - 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f, - 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f, - -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f, - -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f, - 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f, - 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f, - 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f, - -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f, - 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f, - -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f, - -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f, - -0.004554f, 0.001784f, 
0.035143f, -0.202237f, 0.080252f, -0.003912f, - -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f, - -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f, - -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f, - 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f, - -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f, - -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f, - -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f, - -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f, - -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f, - 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f, - -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f, - 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f, - -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f, - 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f, - -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f, - -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f, - 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f, - 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f, - -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f, - 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f, - 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f, - 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f, - -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f, - -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f, - -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f, - -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f, - 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f, - -0.379945f, 
-0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f, - -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f, - -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f, - 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f, - 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f, - 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f, - -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f, - 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f, - -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f, - -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f, - -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f, - -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f, - -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f, - -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f, - 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f, - 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f, - -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f, - -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f, - -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f, - 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f, - -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f, - -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f, - 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f, - 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f, - 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f, - -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f, - -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f, - -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f, - 
-0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f, - -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f, - -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f, - -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f, - 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f, - -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f, - -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f, - 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f, - -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f, - 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f, - 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f, - 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f, - -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f, - 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f, - -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f, - 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f, - -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f, - -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f, - -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f, - 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f, - -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f, - 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f, - -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f, - 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f, - 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f, - -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f, - -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f, - -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f, - 0.006962f, 
0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f, + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 
0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, 
-0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 
0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, }; static const float av1_4_partition_nn_bias_32_layer0[32] = { - 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f, - 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f, - -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f, - -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f, - -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f, - 0.084333f, -0.056270f, + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, }; static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { - -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f, - -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 
0.210832f, - 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f, - -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f, - 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f, - -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f, - -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f, - 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f, - 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f, - 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f, - 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f, - 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f, - 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f, - -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f, - 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f, - 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f, - 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f, - -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f, - -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f, - 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f, - -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f, - -0.295318f, 0.537378f, + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, 
-0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, }; static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { - -0.332518f, - 0.114452f, - 0.098949f, - 0.465896f, + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, }; static const NN_CONFIG av1_4_partition_nnconfig_32 = { @@ -1688,82 +1596,112 @@ static const NN_CONFIG av1_4_partition_nnconfig_32 = { }, }; -static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = { - 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f, - -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f, - 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f, - -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f, - -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f, - -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f, - 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f, - 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f, - -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f, - 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f, - 
-0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f, - 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f, - 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f, - -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f, - 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f, - 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f, - -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f, - -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f, - -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f, - -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f, - 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f, - 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f, - -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f, - -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f, - 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f, - -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f, - 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f, - -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f, - -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f, - 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f, - -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f, - -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f, - -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f, - -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f, - -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f, - 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f, - 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f, - 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f, - 
-0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f, - -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f, - 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f, - -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f, - -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f, - -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f, - -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f, - -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f, - -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f, - 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f, -}; - -static const float av1_4_partition_nn_bias_64_layer0[16] = { - 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f, - -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f, - 0.235166f, 0.449468f, 0.294689f, -0.395300f, -}; - -static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = { - -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f, - 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f, - -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f, - -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f, - 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f, - 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f, - -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f, - -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f, - 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f, - -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f, - -0.021259f, 0.194651f, -0.276294f, -0.109514f, +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, 
-0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 
0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, + -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 
0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, 
-0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, }; static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { - -0.688947f, - 0.121075f, - 0.289597f, - 0.948091f, + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, }; static const NN_CONFIG av1_4_partition_nnconfig_64 = { @@ -1771,7 +1709,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = { LABEL_SIZE, // num_outputs 1, // num_hidden_layers { - 16, // num_hidden_nodes + 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_64_layer0, @@ -1786,8 +1724,725 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = { #undef FEATURE_SIZE #undef LABEL_SIZE +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, 
-0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // 
num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + 
av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + 
av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 
16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float 
av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, 
-0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 
0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 
1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, 
-0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 
0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 
1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, 
-0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, 
-0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 
1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, 
-0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, 
-2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + 
-0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 0.32676f, +}; + +static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES * + LABEL_SIZE] = { + 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f, + 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f, + -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f, + 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f, + 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f, + 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f, + 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f, + 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f, + -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f, + -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f, + 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f, + 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f, + 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f, + 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f, +}; + +static const float av1_rect_partition_nn_bias_128_layer1[3] = { + 1.09014f, + -0.53317f, + -0.55668f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs 
+ LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_128_layer0, + av1_rect_partition_nn_weights_128_layer1 }, + { av1_rect_partition_nn_bias_128_layer0, + av1_rect_partition_nn_bias_128_layer1 } +}; +#undef FEATURE_SIZE +#undef NUM_NODES +#undef LABEL_SIZE + #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_ +#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index 461c3af83..c5508e25c 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -70,7 +70,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, // TODO(any): please enable multi-thread and remove the flag when loop // filter mask is compatible with multi-thread. #if LOOP_FILTER_BITMASK - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane, plane + 1, partial_frame); #else if (cpi->num_workers > 1) @@ -193,6 +193,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, (void)sd; lf->sharpness_level = 0; + cpi->td.mb.rdmult = cpi->rd.RDMULT; if (method == LPF_PICK_MINIMAL_LPF) { lf->filter_level[0] = 0; diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h index 2a168358e..357097ae1 100644 --- a/third_party/aom/av1/encoder/picklpf.h +++ b/third_party/aom/av1/encoder/picklpf.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_PICKLPF_H_ -#define AV1_ENCODER_PICKLPF_H_ +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -27,4 +27,4 @@ void av1_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // AV1_ENCODER_PICKLPF_H_ +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index 28b693b08..e7804f6b4 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -15,6 +15,7 @@ #include #include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" @@ -22,7 +23,6 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" - #include "av1/common/onyxc_int.h" #include "av1/common/quant_common.h" #include "av1/common/restoration.h" @@ -181,6 +181,77 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); } +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + 
flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, @@ -192,21 +263,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int xq[2]; decode_xq(xqd, xq, params); if (!use_highbitdepth) { - const uint8_t *src = src8; - const uint8_t *dat = dat8; - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int32_t u = - (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); - int32_t v = u << SGRPROJ_PRJ_BITS; - if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u); - if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u); - const int32_t e = - ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - - src[i * src_stride + j]; - err += e * e; - } - } + err = 
av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); } else { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); @@ -463,9 +522,11 @@ static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, // Iterate over the stripe in blocks of width pu_width for (int j = 0; j < width; j += pu_width) { const int w = AOMMIN(pu_width, width - j); - av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j, - flt1_row + j, flt_stride, sgr_params_idx, - bit_depth, use_highbd); + const int ret = av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd); + (void)ret; + assert(!ret); } } } @@ -588,22 +649,9 @@ static void search_sgrproj(const RestorationTileLimits *limits, if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; } -static double find_average(const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int stride) { - uint64_t sum = 0; - double avg = 0; - int i, j; - aom_clear_system_state(); - for (i = v_start; i < v_end; i++) - for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; - avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); - return avg; -} - -static void compute_stats(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, double *M, double *H) { +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, double *M, double *H) { int i, j, k, l; double Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -626,8 +674,7 @@ static void compute_stats(int wiener_win, const uint8_t *dgd, assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { M[k] += Y[k] * X; - H[k * wiener_win2 + k] += 
Y[k] * Y[k]; - for (l = k + 1; l < wiener_win2; ++l) { + for (l = k; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. @@ -1073,9 +1120,9 @@ static void search_wiener(const RestorationTileLimits *limits, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); } else { - compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, - limits->h_end, limits->v_start, limits->v_end, - rsc->dgd_stride, rsc->src_stride, M, H); + av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); } const MACROBLOCK *const x = rsc->x; @@ -1266,6 +1313,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // problem, as these elements are ignored later, but in order to quiet // Valgrind's warnings we initialise the array below. memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + cpi->td.mb.rdmult = cpi->rd.RDMULT; RestSearchCtxt rsc; const int plane_start = AOM_PLANE_Y; diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h index 179b89ff9..3fec0c34b 100644 --- a/third_party/aom/av1/encoder/pickrst.h +++ b/third_party/aom/av1/encoder/pickrst.h @@ -8,22 +8,39 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_PICKRST_H_ -#define AV1_ENCODER_PICKRST_H_ +#ifndef AOM_AV1_ENCODER_PICKRST_H_ +#define AOM_AV1_ENCODER_PICKRST_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/encoder.h" +#include "aom_ports/system_state.h" struct yv12_buffer_config; struct AV1_COMP; +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static INLINE double find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + double avg = 0; + int i, j; + aom_clear_system_state(); + for (i = v_start; i < v_end; i++) + for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; + avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); + return avg; +} + void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_PICKRST_H_ +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h index 42a4c590b..40dd46768 100644 --- a/third_party/aom/av1/encoder/pustats.h +++ b/third_party/aom/av1/encoder/pustats.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_PUSTATS_H_ -#define AV1_ENCODER_PUSTATS_H_ +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ #ifdef __cplusplus extern "C" { @@ -18,83 +18,78 @@ extern "C" { #include "av1/encoder/ml.h" -#define NUM_FEATURES 11 +#define NUM_FEATURES_PUSTATS 8 #define NUM_HIDDEN_LAYERS 2 #define HIDDEN_LAYERS_0_NODES 12 #define HIDDEN_LAYERS_1_NODES 10 #define LOGITS_NODES 1 static const float - av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES * + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { - 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f, - -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f, - -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f, - 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f, - -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f, - -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f, - 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f, - -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f, - -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f, - 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f, - 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f, - -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f, - 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f, - -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f, - -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f, - -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f, - -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f, - 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f, - -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f, + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, 
-0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, }; static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { - -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f, - -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f, + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, }; static const float av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f, - -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f, - -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f, - 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f, - -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f, - -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f, - 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f, - 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f, - 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f, - -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f, - 1.0898f, 0.0795f, 
-0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f, - -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f, - -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f, - 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f, - -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f, - -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f, - -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f, - -1.0829f, + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, }; static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { - -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f, - 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f, + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f, }; static const float 
av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f, - 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f, + 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, }; static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { - 30.6878f, + 4.5103f, }; static const NN_CONFIG av1_pustats_rate_nnconfig = { - NUM_FEATURES, // num_inputs + NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes @@ -111,76 +106,71 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = { }; static const float - av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES * + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { - 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f, - -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f, - 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f, - 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f, - -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f, - -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f, - -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f, - -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f, - 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f, - -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f, - 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f, - -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f, - 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f, - 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f, - -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f, - -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f, - 0.0222f, -1.1055f, 0.1825f, 
0.0575f, 0.0114f, -0.1259f, 0.3148f, - -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f, - -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f, + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, }; static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { - 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f, - -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f, + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, }; static const float av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { - -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f, - -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f, - -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f, - 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f, - 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f, - -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f, - -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, 
-0.0002f, -0.0595f, - -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f, - 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f, - 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f, - -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f, - 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f, - 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f, - -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f, - -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f, - -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f, - 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f, - -0.5539f, + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, }; static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { - 11.9337f, 
-0.3681f, -6.1324f, 12.674f, 9.0956f, - 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f, + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, }; static const float av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { - 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f, - -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f, + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f, }; static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { - 4.6065f, + 2.3371f, }; static const NN_CONFIG av1_pustats_dist_nnconfig = { - NUM_FEATURES, // num_inputs + NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes @@ -196,7 +186,6 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = { }, }; -#undef NUM_FEATURES #undef NUM_HIDDEN_LAYERS #undef HIDDEN_LAYERS_0_NODES #undef HIDDEN_LAYERS_1_NODES @@ -206,4 +195,4 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = { } // extern "C" #endif -#endif // AV1_ENCODER_PUSTATS_H_ +#endif // AOM_AV1_ENCODER_PUSTATS_H_ diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h index 9b2dac965..0bca39102 100644 --- a/third_party/aom/av1/encoder/random.h +++ b/third_party/aom/av1/encoder/random.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_RANDOM_H_ -#define AV1_ENCODER_RANDOM_H_ +#ifndef AOM_AV1_ENCODER_RANDOM_H_ +#define AOM_AV1_ENCODER_RANDOM_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ static INLINE unsigned int lcg_rand16(unsigned int *state) { } // extern "C" #endif -#endif // AV1_ENCODER_RANDOM_H_ +#endif // AOM_AV1_ENCODER_RANDOM_H_ diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h index 1019055ed..c429f2ce5 100644 --- a/third_party/aom/av1/encoder/ransac.h +++ b/third_party/aom/av1/encoder/ransac.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RANSAC_H_ -#define AV1_ENCODER_RANSAC_H_ +#ifndef AOM_AV1_ENCODER_RANSAC_H_ +#define AOM_AV1_ENCODER_RANSAC_H_ #include #include @@ -32,4 +32,4 @@ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, int ransac_translation(int *matched_points, int npoints, int *num_inliers_by_motion, double *params_by_motion, int num_motions); -#endif // AV1_ENCODER_RANSAC_H_ +#endif // AOM_AV1_ENCODER_RANSAC_H_ diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h index 14d23f10f..7cd0962c5 100644 --- a/third_party/aom/av1/encoder/rate_distortion_model_params.h +++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ -#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ #ifdef __cplusplus extern "C" { @@ -588,4 +588,4 @@ static const NN_CONFIG av1_rdcost_model_nnconfig = { } // extern "C" #endif -#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ +#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_ diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index 3aae0144e..2597fb990 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -117,7 +117,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = av1_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); @@ -253,6 +253,9 @@ int av1_rc_get_default_min_gf_interval(int width, int height, int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); interval += (interval & 0x01); // Round to even value +#if CONFIG_FIX_GF_LENGTH + interval = AOMMAX(FIXED_GF_LENGTH, interval); +#endif return AOMMAX(interval, min_gf_interval); } @@ -299,9 +302,9 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { - rc->rate_correction_factors[i] = 1.0; + 
rc->rate_correction_factors[i] = 0.7; } - + rc->rate_correction_factors[KF_STD] = 1.0; rc->min_gf_interval = oxcf->min_gf_interval; rc->max_gf_interval = oxcf->max_gf_interval; if (rc->min_gf_interval == 0) @@ -556,6 +559,14 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, arfgf_low_motion_minq, arfgf_high_motion_minq); } +#if REDUCE_LAST_ALT_BOOST +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} +#endif + static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const unsigned int curr_frame = cpi->common.current_video_frame; @@ -918,7 +929,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { #define STATIC_MOTION_THRESH 95 static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, int height, int *bottom_index, - int *top_index) { + int *top_index, int *arf_q) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; @@ -959,7 +970,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, qindex = rc->last_boosted_qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, bit_depth); + last_boosted_q * 0.5, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } } else { @@ -1000,17 +1011,49 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // For constrained quality dont allow Q less than the cq level if (oxcf->rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; +#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ + if (gf_group->update_type[gf_group->index] == ARF_UPDATE || + (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) { +#endif // USE_SYMM_MULTI_LAYER 
&& MULTI_LVL_BOOST_VBR_CQ + active_best_quality = get_gf_active_quality(rc, q, bit_depth); - active_best_quality = get_gf_active_quality(rc, q, bit_depth); - - // Constrained quality use slightly lower active best. - active_best_quality = active_best_quality * 15 / 16; + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; +#if REDUCE_LAST_ALT_BOOST + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); + } +#endif + *arf_q = active_best_quality; +#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ + } else { + active_best_quality = rc->arf_q; + int this_height = gf_group->pyramid_level[gf_group->index]; + while (this_height < gf_group->pyramid_height) { + active_best_quality = (active_best_quality + cq_level + 1) / 2; + ++this_height; + } + } +#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ } else if (oxcf->rc_mode == AOM_Q) { if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + *arf_q = active_best_quality; +#if REDUCE_LAST_ALT_BOOST + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); +#endif + } else { + active_best_quality = rc->arf_q; + } #if USE_SYMM_MULTI_LAYER if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { int this_height = gf_group->pyramid_level[gf_group->index]; @@ -1030,6 +1073,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, } } else { active_best_quality = 
get_gf_active_quality(rc, q, bit_depth); +#if REDUCE_LAST_ALT_BOOST + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); +#endif #if USE_SYMM_MULTI_LAYER if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { int this_height = gf_group->pyramid_level[gf_group->index]; @@ -1104,7 +1153,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); } else { - q = rc->last_boosted_qindex; + q = AOMMIN(rc->last_boosted_qindex, + (active_best_quality + active_worst_quality) / 2); } } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1129,7 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, return q; } -int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, +int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index) { int q; if (cpi->oxcf.pass == 0) { @@ -1140,8 +1190,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index, top_index); } else { + assert(cpi->oxcf.pass == 2 && "invalid encode pass"); + + GF_GROUP *gf_group = &cpi->twopass.gf_group; + int arf_q = 0; + q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index, - top_index); + top_index, &arf_q); + + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + cpi->rc.arf_q = arf_q; + } } return q; @@ -1327,13 +1386,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { update_golden_frame_stats(cpi); if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; - - // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME - // differently here for rc->avg_frame_bandwidth. 
- if (cm->show_frame || rc->is_bwd_ref_frame) { - rc->frames_since_key++; - rc->frames_to_key--; - } // if (cm->current_video_frame == 1 && cm->show_frame) /* rc->this_frame_target = @@ -1635,10 +1687,6 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; -#if FIX_GF_INTERVAL_LENGTH - rc->max_gf_interval = FIXED_GF_LENGTH + 1; -#endif - // Clamp min to max rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval); } diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index f0508da9e..198ecab97 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RATECTRL_H_ -#define AV1_ENCODER_RATECTRL_H_ +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ #include "aom/aom_codec.h" #include "aom/aom_integer.h" @@ -25,13 +25,27 @@ extern "C" { #define BPER_MB_NORMBITS 9 #define CUSTOMIZED_GF 1 -#define FIX_GF_INTERVAL_LENGTH 0 -#if FIX_GF_INTERVAL_LENGTH +#if CONFIG_FIX_GF_LENGTH #define FIXED_GF_LENGTH 16 +#define MAX_PYRAMID_LVL 4 +// We allow a frame to have at most two left/right descendants before changing +// them into to a subtree, i.e., we allow the following structure: +/* OUT_OF_ORDER_FRAME + / / \ \ +(two left children) F F F F (two right children) */ +// Therefore the max gf size supported by 4 layer structure is +// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both side of their parent) +#define MAX_PYRAMID_SIZE 24 #define USE_SYMM_MULTI_LAYER 1 +#define REDUCE_LAST_ALT_BOOST 1 +#define REDUCE_LAST_GF_LENGTH 1 +#define MULTI_LVL_BOOST_VBR_CQ 1 #else #define USE_SYMM_MULTI_LAYER 0 +#define REDUCE_LAST_ALT_BOOST 0 +#define REDUCE_LAST_GF_LENGTH 0 +#define MULTI_LVL_BOOST_VBR_CQ 0 #endif #if USE_SYMM_MULTI_LAYER @@ 
-159,6 +173,9 @@ typedef struct { // Auto frame-scaling variables. int rf_level_maxq[RATE_FACTOR_LEVELS]; + float_t arf_boost_factor; + // Q index used for ALT frame + int arf_q; } RATE_CONTROL; struct AV1_COMP; @@ -228,7 +245,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, int *frame_over_shoot_limit); // Picks q and q bounds given the target for bits -int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height, +int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index); // Estimates q to achieve a target bits per frame @@ -275,4 +292,4 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_RATECTRL_H_ +#endif // AOM_AV1_ENCODER_RATECTRL_H_ diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c deleted file mode 100644 index e69de29bb..000000000 diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index c4d4777bf..b87d89e50 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -648,6 +648,473 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, } } +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} + +static const double interp_rgrid_surf[65 * 18] = { + 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446, + 
0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868, + 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880, + 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510, + 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225, + 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788, + 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783, + 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772, + 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275, + 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783, + 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841, + 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553, + 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787, + 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620, + 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735, + 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803, + 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334, + 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042, + 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805, + 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548, + 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457, + 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652, + 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961, + 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872, + 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045, + 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441, + 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190, + 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155, + 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791, + 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533, + 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336, + 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676, + 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 
56.262859, + 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706, + 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669, + 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020, + 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308, + 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916, + 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182, + 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200, + 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257, + 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588, + 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516, + 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371, + 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997, + 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568, + 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528, + 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722, + 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964, + 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861, + 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632, + 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610, + 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416, + 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242, + 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944, + 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953, + 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130, + 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193, + 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269, + 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202, + 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437, + 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262, + 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098, + 6.136611, 
7.305215, 7.272532, 10.646713, 15.630815, 22.383168, + 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305, + 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670, + 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462, + 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513, + 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882, + 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674, + 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539, + 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945, + 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282, + 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382, + 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079, + 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488, + 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550, + 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356, + 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534, + 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069, + 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805, + 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967, + 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087, + 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573, + 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414, + 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739, + 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741, + 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017, + 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189, + 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798, + 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 
332.922370, + 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759, + 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847, + 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480, + 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839, + 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578, + 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526, + 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624, + 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088, + 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577, + 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715, + 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139, + 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485, + 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402, + 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138, + 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162, + 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203, + 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325, + 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147, + 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989, + 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211, + 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463, + 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737, + 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946, + 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434, + 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854, + 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482, + 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 
441.724083, + 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970, + 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111, + 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001, + 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644, + 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864, + 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288, + 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532, + 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477, + 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027, + 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256, + 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148, + 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306, + 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202, + 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658, + 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814, + 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523, + 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068, + 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792, + 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471, + 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800, + 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745, + 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078, + 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510, + 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309, + 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789, + 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909, + 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 
633.236320, + 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800, + 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901, + 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257, + 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934, + 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525, + 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513, + 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092, + 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155, + 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143, + 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732, + 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504, + 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514, + 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339, + 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244, + 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557, + 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259, + 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445, + 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639, + 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093, + 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962, + 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512, + 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629, + 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900, + 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191, + 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721, + 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020, + 531.253628, 650.479606, 662.179143, 
671.466557, 727.134512, 835.764859, + 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641, + 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020, + 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398, + 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663, + 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850, + 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648, + 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631, + 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626, + 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436, + 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546, + 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680, + 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524, + 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, + 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978, + 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349, + 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198, + 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460, + 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378, + 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056, +}; + +static const double interp_dgrid_surf[65 * 18] = { + 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583, + 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431, + 12.092165, 
11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149, + 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265, + 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168, + 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600, + 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798, + 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726, + 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361, + 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657, + 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028, + 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189, + 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449, + 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998, + 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359, + 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801, + 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647, + 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672, + 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211, + 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358, + 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393, + 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605, + 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398, + 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035, + 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358, + 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731, + 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044, + 14.504690, 13.951810, 13.450583, 12.687876, 
11.849334, 10.494514, 9.210817, + 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813, + 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118, + 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550, + 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111, + 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817, + 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898, + 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579, + 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138, + 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174, + 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817, + 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141, + 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897, + 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993, + 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072, + 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409, + 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596, + 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525, + 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236, + 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624, + 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860, + 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293, + 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229, + 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798, + 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587, + 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862, + 7.123268, 5.787860, 4.025994, 3.290000, 
3.084410, 2.810905, 2.222916, + 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265, + 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574, + 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027, + 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808, + 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911, + 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854, + 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988, + 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277, + 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442, + 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670, + 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080, + 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066, + 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182, + 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589, + 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333, + 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127, + 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090, + 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234, + 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686, + 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518, + 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724, + 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520, + 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288, + 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584, + 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363, + 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078, + 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 
0.876923, + 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394, + 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302, + 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335, + 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988, + 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746, + 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545, + 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727, + 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965, + 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166, + 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420, + 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038, + 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871, + 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921, + 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323, + 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222, + 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619, + 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894, + 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731, + 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690, + 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167, + 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851, + 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310, + 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950, + 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993, + 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465, + 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429, + 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987, + 0.520407, 0.487239, 0.349473, 0.251741, 
0.184897, 0.135813, 0.107098, + 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168, + 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452, + 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566, + 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535, + 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439, + 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361, + 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976, + 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135, + 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108, + 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039, + 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094, + 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449, + 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401, + 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422, + 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355, + 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905, + 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814, + 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762, + 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569, + 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723, + 0.002781, 0.002343, 
0.086702, 0.080014, 0.057174, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839, + 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040, + 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690, + 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360, + 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238, + 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692, + 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959, + 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465, + 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683, + 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394, + 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700, + 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815, + 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475, + 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690, + 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371, + 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221, + 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094, + 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475, + 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690, + 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280, + 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333, + 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, + 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458, + 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683, + 
0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762, + 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856, + 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232, + 0.001862, +}; + +void av1_model_rd_surffit(double xm, double yl, double *rate_f, + double *dist_f) { + const double x_start = -0.5; + const double x_end = 16.5; + const double x_step = 1; + const double y_start = -15.5; + const double y_end = 16.5; + const double y_step = 0.5; + const double epsilon = 1e-6; + const int stride = (int)rint((x_end - x_start) / x_step) + 1; + (void)y_end; + + xm = AOMMAX(xm, x_start + x_step + epsilon); + xm = AOMMIN(xm, x_end - x_step - epsilon); + yl = AOMMAX(yl, y_start + y_step + epsilon); + yl = AOMMIN(yl, y_end - y_step - epsilon); + + const double y = (yl - y_start) / y_step; + const double x = (xm - x_start) / x_step; + + const int yi = (int)floor(y); + const int xi = (int)floor(x); + assert(xi > 0); + assert(yi > 0); + + const double yo = y - yi; + const double xo = x - xi; + const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)]; + const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)]; + *rate_f = interp_bicubic(prate, stride, xo, yo); + *dist_f = interp_bicubic(pdist, stride, xo, yo); +} + +static const double interp_rgrid_curv[65] = { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876, + 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983, + 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203, + 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495, + 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472, + 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660, + 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390, + 2300.571467, 2702.152072, 
3175.775119, 3730.230519, 4374.308184, 5116.798028, + 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839, +}; + +static const double interp_dgrid_curv[65] = { + 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, + 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692, + 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773, + 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051, + 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427, + 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354, + 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, +}; + +void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end - x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[(xi - 1)]; + const double *pdist = &interp_dgrid_curv[(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + *distbysse_f = interp_cubic(pdist, xo); +} + static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], @@ -814,8 +1281,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_NEARESTG] = 0; } - rd->thresh_mult[THR_DC] += 1000; - rd->thresh_mult[THR_NEWMV] += 1000; rd->thresh_mult[THR_NEWL2] += 1000; rd->thresh_mult[THR_NEWL3] += 1000; @@ -840,8 +1305,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_GLOBALG] += 
2000; rd->thresh_mult[THR_GLOBALA] += 2000; - rd->thresh_mult[THR_PAETH] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; @@ -956,15 +1419,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500; - rd->thresh_mult[THR_H_PRED] += 2000; - rd->thresh_mult[THR_V_PRED] += 2000; - rd->thresh_mult[THR_D135_PRED] += 2500; - rd->thresh_mult[THR_D203_PRED] += 2500; - rd->thresh_mult[THR_D157_PRED] += 2500; - rd->thresh_mult[THR_D67_PRED] += 2500; - rd->thresh_mult[THR_D113_PRED] += 2500; - rd->thresh_mult[THR_D45_PRED] += 2500; - rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000; @@ -996,6 +1450,20 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200; rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200; + + rd->thresh_mult[THR_DC] += 1000; + rd->thresh_mult[THR_PAETH] += 1000; + rd->thresh_mult[THR_SMOOTH] += 2000; + rd->thresh_mult[THR_SMOOTH_V] += 2000; + rd->thresh_mult[THR_SMOOTH_H] += 2000; + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D203_PRED] += 2500; + rd->thresh_mult[THR_D157_PRED] += 2500; + rd->thresh_mult[THR_D67_PRED] += 2500; + rd->thresh_mult[THR_D113_PRED] += 2500; + rd->thresh_mult[THR_D45_PRED] += 2500; } void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 692367d7a..755b61df5 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_RD_H_ -#define AV1_ENCODER_RD_H_ +#ifndef AOM_AV1_ENCODER_RD_H_ +#define AOM_AV1_ENCODER_RD_H_ #include @@ -57,8 +57,6 @@ typedef enum { THR_NEARESTA, THR_NEARESTG, - THR_DC, - THR_NEWMV, THR_NEWL2, THR_NEWL3, @@ -100,12 +98,6 @@ typedef enum { THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, - THR_PAETH, - - THR_SMOOTH, - THR_SMOOTH_V, - THR_SMOOTH_H, - THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEARESTLA, THR_COMP_NEAREST_NEWLA, @@ -202,15 +194,6 @@ typedef enum { THR_COMP_NEW_NEWGA2, THR_COMP_GLOBAL_GLOBALGA2, - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D203_PRED, - THR_D157_PRED, - THR_D67_PRED, - THR_D113_PRED, - THR_D45_PRED, - THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, @@ -243,7 +226,26 @@ typedef enum { THR_COMP_NEW_NEWBA, THR_COMP_GLOBAL_GLOBALBA, - MAX_MODES + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, + + MAX_MODES, + + LAST_SINGLE_REF_MODES = THR_GLOBALG, + MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1, + LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA, + MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1 } THR_MODES; typedef enum { @@ -392,6 +394,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); +void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(double xm, double yl, double *rate_f, + double *distbysse_f); + int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, const MACROBLOCKD *xd); @@ -455,4 +461,4 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, } // extern "C" #endif -#endif // AV1_ENCODER_RD_H_ +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 
fef6d2875..c2d15534f 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -55,17 +55,97 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/tx_prune_model_weights.h" -#define DNN_BASED_RD_INTERP_FILTER 0 +typedef void (*model_rd_for_sb_type)( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -// Set this macro as 1 to collect data about tx size selection. -#define COLLECT_TX_SIZE_DATA 0 +static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist); +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_dnn( + 
const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -#if COLLECT_TX_SIZE_DATA -static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; -#endif +typedef enum { + MODELRD_LEGACY, + MODELRD_CURVFIT, + MODELRD_SUFFIT, + MODELRD_DNN, + MODELRD_FULLRDY, + MODELRD_TYPES +} ModelRdType; + +static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit, + model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy +}; + +static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit, + model_rd_with_dnn, NULL +}; + +// 0: Legacy model +// 1: Curve fit 
model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 2 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_JNT_COMPOUND 1 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { @@ -103,6 +183,16 @@ typedef enum { FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 } FAST_TX_SEARCH_MODE; +static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t ref_best_rd); + +static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode); + struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; @@ -112,6 +202,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int exit_early; + int incomplete_exit; int use_fast_coef_costing; FAST_TX_SEARCH_MODE ftxs_mode; }; @@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEWMV, { LAST_FRAME, NONE_FRAME } }, { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, @@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, - { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, - - { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, 
@@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, @@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, }; static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { @@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode, if (this_mode >= SINGLE_INTER_MODE_START && this_mode < SINGLE_INTER_MODE_END) { assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); - assert(second_ref_frame == NONE_FRAME); return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] [ref_frame]; } @@ -479,6 +567,12 @@ static const 
UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_D113_PRED, UV_D45_PRED, }; +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + typedef struct InterModeSearchState { int64_t best_rd; MB_MODE_INFO best_mbmode; @@ -510,32 +604,21 @@ typedef struct InterModeSearchState { int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; - int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; } InterModeSearchState; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - -typedef struct InterModeRdModel { - int ready; - double a; - double b; - double dist_mean; - int skip_count; - int non_skip_count; - int fp_skip_count; - int bracket_idx; -} InterModeRdModel; - -InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; - -#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 -static int inter_mode_data_idx[4]; -static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; - int inter_mode_data_block_idx(BLOCK_SIZE 
bsize) { if (bsize == BLOCK_8X8) return 1; if (bsize == BLOCK_16X16) return 2; @@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) { return -1; } -void av1_inter_mode_data_init() { +void av1_inter_mode_data_init(TileDataEnc *tile_data) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; md->ready = 0; - md->skip_count = 0; - md->non_skip_count = 0; - md->fp_skip_count = 0; - md->bracket_idx = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } } -void av1_inter_mode_data_show(const AV1_COMMON *cm) { - printf("frame_offset %d\n", cm->frame_offset); - for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; - if (md->ready) { - printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, - md->non_skip_count, md->skip_count, md->fp_skip_count); +static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + const double est_ld = md->a * sse + md->b; + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld); + *est_dist = (int64_t)round(md->dist_mean); } + return 1; } + return 0; } -static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, - int curr_cost) { - aom_clear_system_state(); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; - if (md->ready) { - const double est_ld = md->a * sse + md->b; - const double 
est_residue_cost = (sse - md->dist_mean) / est_ld; - const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; - const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); - const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); +static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult, + int64_t sse, int curr_cost) { + int est_residue_cost; + int64_t est_dist; + if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) { + int rate = est_residue_cost + curr_cost; + int64_t est_rd = RDCOST(rdmult, rate, est_dist); return est_rd; } return 0; } -#define DATA_BRACKETS 7 -static const int data_num_threshold[DATA_BRACKETS] = { - 200, 400, 800, 1600, 3200, 6400, INT32_MAX -}; - -void av1_inter_mode_data_fit(int rdmult) { +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { aom_clear_system_state(); for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { const int block_idx = inter_mode_data_block_idx(bsize); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (block_idx == -1) continue; - int data_num = inter_mode_data_idx[block_idx]; - if (data_num < data_num_threshold[md->bracket_idx]) { + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + 
md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } - double my = 0; - double mx = 0; - double dx = 0; - double dxy = 0; - double dist_mean = 0; - const int train_num = data_num; - for (int i = 0; i < train_num; ++i) { - const double sse = (double)inter_mode_data_sse[block_idx][i]; - const double dist = (double)inter_mode_data_dist[block_idx][i]; - const double residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const double ld = (sse - dist) / residue_cost; - dist_mean += dist; - my += ld; - mx += sse; - dx += sse * sse; - dxy += sse * ld; - } - dist_mean = dist_mean / data_num; - my = my / train_num; - mx = mx / train_num; - dx = sqrt(dx / train_num); - dxy = dxy / train_num; - - md->dist_mean = dist_mean; - md->a = (dxy - mx * my) / (dx * dx - mx * mx); - md->b = my - md->a * mx; - ++md->bracket_idx; - md->ready = 1; - assert(md->bracket_idx < DATA_BRACKETS); - (void)rdmult; -#if 0 - int skip_count = 0; - int fp_skip_count = 0; - double avg_error = 0; - const int test_num = data_num; - for (int i = 0; i < data_num; ++i) { - const int64_t sse = inter_mode_data_sse[block_idx][i]; - const int64_t dist = inter_mode_data_dist[block_idx][i]; - const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; - const int64_t est_rd = - get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); - const int64_t real_rd = RDCOST(rdmult, all_cost, dist); - const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; - if (est_rd > ref_best_rd) { - ++skip_count; - if (real_rd < 
ref_best_rd) { - ++fp_skip_count; - } - } - avg_error += abs(est_rd - real_rd) * 100. / real_rd; - } - avg_error /= test_num; - printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", - test_num, bsize, avg_error, skip_count, fp_skip_count); -#endif } } -static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, - int residue_cost, int all_cost, - int64_t ref_best_rd) { +static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int64_t dist, int residue_cost) { if (residue_cost == 0 || sse == dist) return; const int block_idx = inter_mode_data_block_idx(bsize); if (block_idx == -1) return; - if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { - const int data_idx = inter_mode_data_idx[block_idx]; - inter_mode_data_sse[block_idx][data_idx] = sse; - inter_mode_data_dist[block_idx][data_idx] = dist; - inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; - inter_mode_data_all_cost[block_idx][data_idx] = all_cost; - inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; - ++inter_mode_data_idx[block_idx]; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + aom_clear_system_state(); + const double ld = (sse - dist) * 1. 
/ residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += sse * sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, int64_t est_rd, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = est_rd; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + return 0; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static void inter_modes_info_sort(const InterModesInfo *inter_modes_info, + RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; + } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS @@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) { // will lead to pruning i+1 TX types on average static const float *prune_2D_adaptive_thresholds[] = { // TX_4X4 - (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, - 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, - 0.08606f, 0.09827f }, + (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, + 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, + 0.09778f, 0.11780f }, // TX_8X8 - (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, - 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, - 
0.09363f, 0.11682f }, + (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, + 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, + 0.10803f, 0.14124f }, // TX_16X16 (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, 0.06897f, 0.07629f, 0.08875f, 0.11169f }, @@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = { // TX_64X64 NULL, // TX_4X8 - (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09119f, 0.10828f }, + (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, + 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, + 0.10168f, 0.12585f }, // TX_8X4 - (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09167f, 0.10974f }, + (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, + 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, + 0.10583f, 0.13123f }, // TX_8X16 - (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, - 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, - 0.09509f, 0.12097f }, + (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, + 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, + 0.10730f, 0.14221f }, // TX_16X8 - (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, - 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, - 0.09485f, 0.12048f }, + (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, + 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, + 0.10339f, 0.13464f }, // TX_16X32 - (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, - 0.06506f, 0.07385f, 0.08606f, 0.10925f }, + NULL, // TX_32X16 - (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, - 0.06531f, 0.07336f, 0.08582f, 0.11072f }, + NULL, // 
TX_32X64 NULL, // TX_64X32 NULL, // TX_4X16 - NULL, + (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, + 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, + 0.10242f, 0.12878f }, // TX_16X4 - NULL, + (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, + 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, + 0.10217f, 0.12610f }, // TX_8X32 NULL, // TX_32X8 @@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, cur_scores_2D[3]; } score_2D_average /= 16; - score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + + const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } }; + int pruning_aggressiveness = 1; + if (tx_set_type == EXT_TX_SET_ALL16) { + score_2D_transform_pow8(scores_2D, (10 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0]; + } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) { + score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1]; + } // Always keep the TX type with the highest score, prune all others with // score below score_thresh. 
@@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, } } - int pruning_aggressiveness = 0; - if (prune_mode == PRUNE_2D_ACCURATE) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 6; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 4; - } else if (prune_mode == PRUNE_2D_FAST) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 10; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 7; - } const float score_thresh = prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; @@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } static void model_rd_from_sse(const AV1_COMP *const cpi, - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, - int plane, int64_t sse, int *rate, - int64_t *dist) { + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)num_samples; + const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dequant_shift = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; @@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, // Fast approximate the modelling function. 
if (cpi->sf.simple_model_rd_from_var) { const int64_t square_error = sse; - int quantizer = (pd->dequant_Q3[1] >> dequant_shift); + int quantizer = pd->dequant_Q3[1] >> dequant_shift; if (quantizer < 120) - *rate = (int)((square_error * (280 - quantizer)) >> - (16 - AV1_PROB_COST_SHIFT)); + *rate = (int)AOMMIN( + (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), + INT_MAX); else *rate = 0; + assert(*rate >= 0); *dist = (square_error * quantizer) >> 8; } else { - av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], pd->dequant_Q3[1] >> dequant_shift, rate, dist); } @@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. int plane; + (void)mi_row; + (void)mi_col; const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; - for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, if (x->skip_chroma_rd && plane) continue; - // TODO(geza): Write direct sse functions that do not compute - // variance as well. 
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; - - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist); - rate_sum += rate; dist_sum += dist; if (plane_rate) plane_rate[plane] = rate; if (plane_sse) plane_sse[plane] = sse; if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); } if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } @@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, assert(visible_cols > 0); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8) + if (x->using_dist_8x8 && plane == 0) return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, tx_bsize, txb_cols, txb_rows, visible_cols, visible_rows, x->qindex); @@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize, - int force_sse) { + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, @@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 int txb_height = block_size_high[tx_bsize]; int txb_width = 
block_size_wide[tx_bsize]; - if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 && - txb_height >= 8) { + if (x->using_dist_8x8 && plane == 0) { const int src_stride = x->plane[plane].src.stride; const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; @@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, MAX_TX_SIZE, eob, cpi->common.reduced_tx_set_used); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; - } else { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; - } - } -#endif // CONFIG_DIST_8X8 return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, blk_row, blk_col, plane_bsize, tx_bsize); } @@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads( } } +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS - // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values - // 0: Do not collect any RD stats - // 1: Collect RD stats for transform units - // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS == 1 static void PrintTransformUnitStats(const AV1_COMP *const cpi, 
MACROBLOCK *x, @@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. static unsigned int seed = 21743; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "tu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); @@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_COLLECT_RD_STATS == 1 -#if CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_RD_STATS >= 2 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, BLOCK_SIZE plane_bsize) { @@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. 
static unsigned int seed = 95014; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "pu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; - fprintf(fout, "%g %g", rate_norm, dist_norm); + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; @@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g", sad_norm_arr[i]); } - fprintf(fout, " %d %d %d", q_step, bw, bh); + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; - fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); double mean = get_mean(src_diff, diff_stride, bw, bh); mean /= (1 << shift); @@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, "\n"); fclose(fout); } -#endif // CONFIG_COLLECT_RD_STATS == 2 +#endif // CONFIG_COLLECT_RD_STATS >= 2 #endif // CONFIG_COLLECT_RD_STATS -static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE 
plane_bsize, int plane, int64_t *rsse, +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, int *rate, int64_t *dist) { const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int log_numpels = num_pels_log2_lookup[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + const struct macroblock_plane *const p = &x->plane[plane]; int bw, bh; - const int diff_stride = block_size_wide[plane_bsize]; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - const int num_samples = bw * bh; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; - const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; const int16_t *const src_diff = p->src_diff; + const int diff_stride = block_size_wide[plane_bsize]; const int shift = (xd->bd - 8); - int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh); - sse = ROUND_POWER_OF_TWO(sse, shift * 2); - const double sse_norm = (double)sse / num_samples; if (sse == 0) { if (rate) *rate = 0; if (dist) *dist = 0; - if (rsse) *rsse = sse; return; } if (plane) { int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, - &model_dist); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); if (rate) *rate = model_rate; if (dist) *dist = model_dist; - if (rsse) *rsse = sse; return; } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + double sse_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, 
src_stride, dst, dst_stride, src_diff, diff_stride, @@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); mean /= (1 << shift); } - const double variance = sse_norm - mean * mean; - assert(variance >= 0.0); + double sse_norm_sum = 0.0, sse_frac_arr[3]; + for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k]; + for (int k = 0; k < 3; ++k) + sse_frac_arr[k] = + sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25; const double q_sqr = (double)(q_step * q_step); const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0); + const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0); double hor_corr, vert_corr; get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); - float features[11]; + float features[NUM_FEATURES_PUSTATS]; features[0] = (float)hor_corr; features[1] = (float)log_numpels; - features[2] = (float)q_sqr; + features[2] = (float)mean_sqr_by_sse_norm; features[3] = (float)q_sqr_by_sse_norm; - features[4] = (float)sse_norm_arr[0]; - features[5] = (float)sse_norm_arr[1]; - features[6] = (float)sse_norm_arr[2]; - features[7] = (float)sse_norm_arr[3]; - features[8] = (float)sse_norm; - features[9] = (float)variance; - features[10] = (float)vert_corr; + features[4] = (float)sse_frac_arr[0]; + features[5] = (float)sse_frac_arr[1]; + features[6] = (float)sse_frac_arr[2]; + features[7] = (float)vert_corr; float rate_f, dist_by_sse_norm_f; av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f); @@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm)); int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); // Check if skip is better - if (RDCOST(x->rdmult, rate_i, dist_i) >= 
RDCOST(x->rdmult, 0, (sse << 4))) { + if (rate_i == 0) { dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { rate_i = 0; - } else if (rate_i == 0) { dist_i = sse << 4; } if (rate) *rate = rate_i; if (dist) *dist = dist_i; - if (rsse) *rsse = sse; return; } -void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { +static void model_rd_for_sb_with_dnn( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
@@ -2562,19 +2655,30 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; - for (int plane = plane_from; plane <= plane_to; ++plane) { struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - int64_t sse; + int64_t dist, sse; int rate; - int64_t dist; if (x->skip_chroma_rd && plane) continue; - model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist); + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); @@ -2593,110 +2697,385 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, *out_dist_sum = dist_sum; } -static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, - int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const TXB_CTX *const txb_ctx, - FAST_TX_SEARCH_MODE ftxs_mode, - int use_fast_coef_costing, int64_t ref_best_rd, - RD_STATS *best_rd_stats) { - const AV1_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = xd->mi[0]; - const int is_inter = is_inter_block(mbmi); - int64_t best_rd = INT64_MAX; - uint16_t best_eob = 0; - TX_TYPE best_tx_type = DCT_DCT; - TX_TYPE last_tx_type = TX_TYPES; - const int fast_tx_search = ftxs_mode & 
FTXS_DCT_AND_1D_DCT_ONLY; - // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff - // of the best tx_type - DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); - tran_low_t *orig_dqcoeff = pd->dqcoeff; - tran_low_t *best_dqcoeff = this_dqcoeff; - const int txk_type_idx = - av1_get_txk_type_index(plane_bsize, blk_row, blk_col); - av1_invalid_rd_stats(best_rd_stats); +// Fits a surface for rate and distortion using as features: +// log2(sse_norm + 1) and log2(sse_norm/qstep^2) +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xm = log(sse_norm + 1.0) / log(2.0); + const double yl = log(sse_norm / qstepsqr) / log(2.0); + double rate_f, dist_by_sse_norm_f; - TXB_RD_INFO *intra_txb_rd_info = NULL; - uint16_t cur_joint_ctx = 0; - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); - const int within_border = - mi_row >= xd->tile.mi_row_start && - (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && - mi_col >= xd->tile.mi_col_start && - (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); - if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) && - !is_inter && plane == 0 && - tx_size_wide[tx_size] == tx_size_high[tx_size]) { - const uint32_t intra_hash = - 
get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); - const int intra_hash_idx = - find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); - intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f); - cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; - if (intra_hash_idx > 0 && - intra_txb_rd_info->entropy_context == cur_joint_ctx && - x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { - mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; - const TX_TYPE ref_tx_type = - av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, - tx_size, cpi->common.reduced_tx_set_used); - if (ref_tx_type == intra_txb_rd_info->tx_type) { - best_rd_stats->rate = intra_txb_rd_info->rate; - best_rd_stats->dist = intra_txb_rd_info->dist; - best_rd_stats->sse = intra_txb_rd_info->sse; - best_rd_stats->skip = intra_txb_rd_info->eob == 0; - x->plane[plane].eobs[block] = intra_txb_rd_info->eob; - x->plane[plane].txb_entropy_ctx[block] = - intra_txb_rd_info->txb_entropy_ctx; - best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); - best_eob = intra_txb_rd_info->eob; - best_tx_type = intra_txb_rd_info->tx_type; - update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, - best_tx_type); - goto RECON_INTRA; - } - } - } + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); - int rate_cost = 0; - TX_TYPE txk_start = DCT_DCT; - TX_TYPE txk_end = TX_TYPES - 1; - if ((!is_inter && x->use_default_intra_tx_type) || - (is_inter && x->use_default_inter_tx_type)) { - txk_start = txk_end = get_default_tx_type(0, xd, tx_size); - } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) { - if (plane == 0) txk_end = DCT_DCT; + // Check if skip is better + if (rate_i == 
0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; } - uint8_t best_txb_ctx = 0; - const TxSetType tx_set_type = - av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} - TX_TYPE uv_tx_type = DCT_DCT; - if (plane) { - // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y - uv_tx_type = txk_start = txk_end = - av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size, - cm->reduced_tx_set_used); - } - const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type]; - if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || - ext_tx_used_flag == 0x0001) { - txk_start = txk_end = DCT_DCT; - } - uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. - if (txk_start == txk_end) { - allowed_tx_mask = 1 << txk_start; - allowed_tx_mask &= ext_tx_used_flag; - } else if (fast_tx_search) { - allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT - allowed_tx_mask &= ext_tx_used_flag; - } else { +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +// Fits a curve for rate and distortion using as feature: +// log2(sse_norm/qstep^2) +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xqr = log(sse_norm / qstepsqr) / log(2.0); + + double rate_f, dist_by_sse_norm_f; + av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + 
struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; + + if (x->skip_chroma_rd && plane) continue; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + + RD_STATS rd_stats; + if (plane == 0) { + select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); + if (rd_stats.invalid_rate) { + rate = 0; + dist = sse << 4; + } else { + rate = rd_stats.rate; + dist = rd_stats.dist; + } + } else { + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + } + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, + int use_fast_coef_costing, int64_t ref_best_rd, + RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE 
best_tx_type = DCT_DCT; + TX_TYPE last_tx_type = TX_TYPES; + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff + // of the best tx_type + DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); + tran_low_t *orig_dqcoeff = pd->dqcoeff; + tran_low_t *best_dqcoeff = this_dqcoeff; + const int txk_type_idx = + av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + av1_invalid_rd_stats(best_rd_stats); + + TXB_RD_INFO *intra_txb_rd_info = NULL; + uint16_t cur_joint_ctx = 0; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) && + !is_inter && plane == 0 && + tx_size_wide[tx_size] == tx_size_high[tx_size]) { + const uint32_t intra_hash = + get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); + const int intra_hash_idx = + find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); + intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + + cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; + if (intra_txb_rd_info->entropy_context == cur_joint_ctx && + x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { + mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col, + tx_size, cpi->common.reduced_tx_set_used); + if (ref_tx_type == intra_txb_rd_info->tx_type) { + best_rd_stats->rate = intra_txb_rd_info->rate; + best_rd_stats->dist = intra_txb_rd_info->dist; + best_rd_stats->sse = intra_txb_rd_info->sse; + best_rd_stats->skip = 
intra_txb_rd_info->eob == 0; + x->plane[plane].eobs[block] = intra_txb_rd_info->eob; + x->plane[plane].txb_entropy_ctx[block] = + intra_txb_rd_info->txb_entropy_ctx; + best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); + best_eob = intra_txb_rd_info->eob; + best_tx_type = intra_txb_rd_info->tx_type; + update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, + best_tx_type); + goto RECON_INTRA; + } + } + } + + int rate_cost = 0; + TX_TYPE txk_start = DCT_DCT; + TX_TYPE txk_end = TX_TYPES - 1; + if ((!is_inter && x->use_default_intra_tx_type) || + (is_inter && x->use_default_inter_tx_type)) { + txk_start = txk_end = get_default_tx_type(0, xd, tx_size); + } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) { + if (plane == 0) txk_end = DCT_DCT; + } + + uint8_t best_txb_ctx = 0; + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used); + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_start = txk_end = + av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size, + cm->reduced_tx_set_used); + } + const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001) { + txk_start = txk_end = DCT_DCT; + } + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. 
+ if (txk_start == txk_end) { + allowed_tx_mask = 1 << txk_start; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; + } else { assert(plane == 0); allowed_tx_mask = ext_tx_used_flag; // !fast_tx_search && txk_end != txk_start && plane == 0 @@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 if (x->using_dist_8x8) use_transform_domain_distortion = 0; #endif - int calc_pixel_domain_distortion_final = cpi->sf.use_transform_domain_distortion == 1 && use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && @@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; int64_t block_sse = - pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1); + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); block_sse *= 16; @@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(best_rd != INT64_MAX); best_rd_stats->skip = best_eob == 0; - if (best_eob == 0) best_tx_type = DCT_DCT; if (plane == 0) { update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, best_tx_type); @@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t rd1, rd2, rd; RD_STATS this_rd_stats; -#if CONFIG_DIST_8X8 - // If sub8x8 tx, 8x8 or larger partition, and luma channel, - // dist-8x8 disables early skip, because the distortion metrics for - // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition - // (new distortion metric) are different. - // Exception is: dist-8x8 is enabled but still MSE is used, - // i.e. "--tune=" encoder option is not used. 
- int bw = block_size_wide[plane_bsize]; - int bh = block_size_high[plane_bsize]; - int disable_early_skip = - x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; -#endif // CONFIG_DIST_8X8 - av1_init_rd_stats(&this_rd_stats); - if (args->exit_early) return; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } if (!is_inter_block(mbmi)) { av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); @@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); - if (plane == 0) { - x->blk_skip[blk_row * - (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + - blk_col] = (x->plane[plane].eobs[block] == 0); - } + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + + blk_col; + + if (plane == 0) + set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0); + else + set_blk_skip(x, plane, blk_idx, 0); rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); @@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, args->this_rd += rd; -#if CONFIG_DIST_8X8 - if (!disable_early_skip) -#endif - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } -} - -#if CONFIG_DIST_8X8 -static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, - struct rdcost_block_args *args) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[0]; - const struct macroblock_plane *const p = &x->plane[0]; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const uint8_t *src = &p->src.buf[0]; - const uint8_t *dst = &pd->dst.buf[0]; - 
const int16_t *pred = &x->pred_luma[0]; - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - int visible_w = bw; - int visible_h = bh; - - int i, j; - int64_t rd, rd1, rd2; - int64_t sse = INT64_MAX, dist = INT64_MAX; - int qindex = x->qindex; - - assert((bw & 0x07) == 0); - assert((bh & 0x07) == 0); - - get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, - &visible_h); - - const int diff_stride = block_size_wide[bsize]; - const int16_t *diff = p->src_diff; - sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w, - visible_h, qindex); - sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - sse *= 16; - - if (!is_inter_block(mbmi)) { - dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } else { - // For inter mode, the decoded pixels are provided in x->pred_luma, - // while the predicted pixels are in dst. - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred16); - else - pred8 = (uint8_t *)pred16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; - } else { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; - } - - dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) { - assert(args->rd_stats.sse == sse); - assert(args->rd_stats.dist == dist); + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; } -#endif // DEBUG_DIST_8X8 - - args->rd_stats.sse = sse; - args->rd_stats.dist = dist; - - rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); 
- rd = AOMMIN(rd1, rd2); - - args->rd_stats.rdcost = rd; - args->this_rd = rd; - - if (args->this_rd > args->best_rd) args->exit_early = 1; } -#endif // CONFIG_DIST_8X8 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, @@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, &args); -#if CONFIG_DIST_8X8 - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 && - bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); -#endif + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early; - if (args.exit_early) { + if (invalid_rd) { av1_invalid_rd_stats(rd_stats); } else { *rd_stats = args.rd_stats; @@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue; + } +#endif RD_STATS this_rd_stats; if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); @@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, } if (n == TX_4X4) break; } - mbmi->tx_size = best_tx_size; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, 
best_blk_skip, sizeof(best_blk_skip[0]) * n4); + } // Reset the pruning flags. av1_zero(x->tx_search_prune); @@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode, // Model based RD estimation for luma intra blocks. static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mode_cost) { + BLOCK_SIZE bsize, int mode_cost, int mi_row, + int mi_col) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } // RD estimation. - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, - &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, - NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_INTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate, + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL); if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { mode_cost += x->angle_delta_cost[mbmi->mode - V_PRED] @@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache, // Given the base colors as specified in centroids[], calculate the RD cost // of palette mode. 
-static void palette_rd_y( - const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, - BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, - uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion, - int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) { +static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x, + MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int dc_mode_cost, const int *data, + int *centroids, int n, uint16_t *color_cache, + int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, + int64_t *best_model_rd, int *rate, int *rate_tokenonly, + int *rate_overhead, int64_t *distortion, + int *skippable, PICK_MODE_CONTEXT *ctx, + uint8_t *blk_skip) { optimize_palette_colors(color_cache, n_cache, n, 1, centroids); int k = av1_remove_duplicates(centroids, n); if (k < PALETTE_MIN_SIZE) { @@ -3551,7 +3837,8 @@ static void palette_rd_y( extend_palette_color_map(color_map, cols, rows, block_width, block_height); const int palette_mode_cost = intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); - int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + int64_t this_model_rd = + intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return; @@ -3580,11 +3867,11 @@ static void palette_rd_y( } static int rd_pick_palette_intra_sby( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip) { + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE 
bsize, int mi_row, + int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) { int rate_overhead = 0; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby( // where the dominant colors and the k-means results are similar. for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { for (i = 0; i < n; ++i) centroids[i] = top_colors[i]; - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } // K-means clustering. @@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby( } av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); } - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } } @@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby( // Return 1 if an filter intra mode is selected; return 0 otherwise. 
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int mode_cost, - int64_t *best_rd, int64_t *best_model_rd, + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int mode_cost, int64_t *best_rd, + int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; @@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t this_rd, this_model_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) continue; @@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, // Run RD calculation with given luma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. 
static int64_t calc_rd_given_intra_angle( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, - RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type, - uint8_t *best_blk_skip) { - int this_rate; + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta, + int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, + TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd, + TX_TYPE *best_txk_type, uint8_t *best_blk_skip) { RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; const int n4 = bsize_to_num_blk(bsize); assert(!is_inter_block(mbmi)); - mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return INT64_MAX; @@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle( super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - this_rate = - tokenonly_rd_stats.rate + mode_cost + - x->angle_delta_cost[mbmi->mode - V_PRED] - [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]]; + int this_rate = + mode_cost + tokenonly_rd_stats.rate + + x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta]; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { @@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle( // With given luma directional intra prediction mode, pick the best angle delta // Return the RD cost corresponding to the best angle delta. 
static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd, + int mi_row, int mi_col, int *rate, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int mode_cost, int64_t best_rd, int64_t *best_model_rd) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; assert(!is_inter_block(mbmi)); - int i, angle_delta, best_angle_delta = 0; - int first_try = 1; - int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + + int best_angle_delta = 0; + int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; TX_SIZE best_tx_size = mbmi->tx_size; - const int n4 = bsize_to_num_blk(bsize); TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; - for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (i = 0; i < 2; ++i) { - best_rd_in = (best_rd == INT64_MAX) - ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 3 : 5))); - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + int first_try = 1; + for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + const int64_t best_rd_in = + (best_rd == INT64_MAX) ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 
3 : 5))); + const int64_t this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in, + (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, best_model_rd, + best_txk_type, best_blk_skip); rd_cost[2 * angle_delta + i] = this_rd; if (first_try && this_rd == INT64_MAX) return best_rd; first_try = 0; @@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, } assert(best_rd != INT64_MAX); - for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - int64_t rd_thresh; - for (i = 0; i < 2; ++i) { + for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { int skip_search = 0; - rd_thresh = best_rd + (best_rd >> 5); + const int64_t rd_thresh = best_rd + (best_rd >> 5); if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) skip_search = 1; if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost, + best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, + best_model_rd, best_txk_type, best_blk_skip); } } } - mbmi->tx_size = best_tx_size; - mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * 
bsize_to_num_blk(bsize)); + } return best_rd; } @@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, // This function is used only for intra_only frames static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t best_rd, - PICK_MODE_CONTEXT *ctx) { + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int64_t best_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); @@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; /* Y Search for intra prediction mode */ - for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) { + for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; mbmi->mode = intra_rd_search_mode_order[mode_idx]; mbmi->angle_delta[PLANE_TYPE_Y] = 0; - this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); + this_model_rd = + intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col); if (best_model_rd != INT64_MAX && this_model_rd > best_model_rd + (best_model_rd >> 1)) continue; @@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, - bmode_costs[mbmi->mode], best_rd, &best_model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate, + &this_rd_stats, bsize, bmode_costs[mbmi->mode], + best_rd, &best_model_rd); 
} else { super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); } @@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } if (try_palette) { - rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, - best_palette_color_map, &best_rd, &best_model_rd, - rate, rate_tokenonly, distortion, skippable, ctx, - ctx->blk_skip); + rd_pick_palette_intra_sby( + cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi, + best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly, + distortion, skippable, ctx, ctx->blk_skip); } if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { - if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, - skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, ctx)) { + if (rd_pick_filter_intra_sby( + cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable, + bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } @@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, + int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); const uint16_t cur_joint_ctx = - (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx; - + (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; const int txk_type_idx = av1_get_txk_type_index(plane_bsize, blk_row, blk_col); // Look up RD and terminate early in case when we've already processed exactly @@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK 
*x, TX_SIZE tx_size, RD_STATS this_rd_stats; search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); av1_merge_rd_stats(rd_stats, &this_rd_stats); @@ -4428,8 +4716,8 @@ static void try_tx_block_no_split( rd_stats->zero_rate = zero_blk_rate; const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); mbmi->inter_tx_size[index] = tx_size; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, - ptl, rd_stats, ftxs_mode, ref_best_rd, + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); assert(rd_stats->rate < INT_MAX); @@ -4444,12 +4732,12 @@ static void try_tx_block_no_split( rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * bw + blk_col] = 1; + set_blk_skip(x, 0, blk_row * bw + blk_col, 1); p->eobs[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { - x->blk_skip[blk_row * bw + blk_col] = 0; + set_blk_skip(x, 0, blk_row * bw + blk_col, 0); rd_stats->skip = 0; } @@ -4482,7 +4770,6 @@ static void try_tx_block_split( MACROBLOCKD *const xd = &x->e_mbd; const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); - struct macroblock_plane *const p = &x->plane[0]; const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; @@ -4490,10 +4777,7 @@ static void try_tx_block_split( RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; -#if CONFIG_DIST_8X8 - int sub8x8_eob[4] = { 0, 0, 0, 0 }; - struct macroblockd_plane *const pd = &xd->plane[0]; -#endif + split_rd_stats->rate = 
x->txfm_partition_cost[txfm_partition_ctx][1]; assert(tx_size < TX_SIZES_ALL); @@ -4511,123 +4795,22 @@ static void try_tx_block_split( &this_cost_valid, ftxs_mode, (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (!this_cost_valid) goto LOOP_EXIT; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && tx_size == TX_8X8) { - sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; - } -#endif // CONFIG_DIST_8X8 + if (!this_cost_valid) goto LOOP_EXIT; + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (no_split_rd < tmp_rd) { - this_cost_valid = 0; - goto LOOP_EXIT; - } + + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } block += sub_step; } } LOOP_EXIT : {} -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - - int64_t dist_8x8; - const int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - int row, col; - - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); - - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, - 8, 8, 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.sse == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->sse = dist_8x8; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred8_16); - else - pred8 
= (uint8_t *)pred8_16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = - pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( - dst)[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } else { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - dst[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, - 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); - } -#endif // CONFIG_DIST_8X8 if (this_cost_valid) *split_rd = tmp_rd; } @@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, const int try_no_split = 1; int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; - +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) + try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16; +#endif TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; // TX no split @@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int 
blk_row, } } -#if COLLECT_TX_SIZE_DATA - // Do not skip tx_split when collecting tx size data. - try_split = 1; -#endif - // TX split int64_t split_rd = INT64_MAX; RD_STATS split_rd_stats; @@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_info_node, &split_rd_stats, &split_rd); } -#if COLLECT_TX_SIZE_DATA - do { - if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break; - -#if 0 - // Randomly select blocks to collect data to reduce output file size. - const int rnd_val = rand() % 2; - if (rnd_val) break; -#endif - - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); - const int within_border = - mi_row >= xd->tile.mi_row_start && - (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && - mi_col >= xd->tile.mi_col_start && - (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); - if (!within_border) break; - - FILE *fp = fopen(av1_tx_size_data_output_file, "a"); - if (!fp) break; - - // Split decision, RD cost, block type(inter/intra), q-index, rdmult, - // and block size. - const int split_selected = sum_rd < this_rd; - const int is_inter = 1; - const int txb_w = tx_size_wide[tx_size]; - const int txb_h = tx_size_high[tx_size]; - fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected, - (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex, - x->rdmult, is_inter, txb_w, txb_h); - - // Residue signal. 
- const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *src_diff = - &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; - for (int r = 0; r < txb_h; ++r) { - for (int c = 0; c < txb_w; ++c) { - fprintf(fp, "%d,", src_diff[c]); - } - src_diff += diff_stride; - } - fprintf(fp, "\n"); - - fclose(fp); - } while (0); -#endif // COLLECT_TX_SIZE_DATA - if (no_split.rd < split_rd) { ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; @@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, mbmi->tx_size = tx_size_selected; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, no_split.tx_type); - x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; + set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip); } else { *rd_stats = split_rd_stats; if (split_rd == INT64_MAX) *is_cost_valid = 0; @@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, TXB_RD_INFO_NODE *rd_info_tree) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; - int64_t this_rd = 0; + int64_t this_rd = 0, skip_rd = 0; if (ref_best_rd < 0) is_cost_valid = 0; @@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + skip_rd = RDCOST(x->rdmult, s1, 0); + this_rd = RDCOST(x->rdmult, s0, 0); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd))); select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, ref_best_rd - this_rd, 
&is_cost_valid, - ftxs_mode, rd_info_tree); + &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode, + rd_info_tree); if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += - AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); block += step; if (rd_info_tree != NULL) rd_info_tree += 1; } } + if (skip_rd <= this_rd) { + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } else { + rd_stats->skip = 0; + } } - const int skip_ctx = av1_get_skip_context(xd); - const int s0 = x->skip_cost[skip_ctx][0]; - const int s1 = x->skip_cost[skip_ctx][1]; - int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - if (skip_rd <= this_rd) { - this_rd = skip_rd; - rd_stats->rate = 0; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; - } else { - rd_stats->skip = 0; - } - if (this_rd > ref_best_rd) is_cost_valid = 0; - if (!is_cost_valid) { // reset cost value av1_invalid_rd_stats(rd_stats); @@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; rd_stats->ref_rdcost = ref_best_rd; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta, - tl, rd_stats, ftxs_mode, ref_best_rd, NULL); + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, 
MACROBLOCK *x, int blk_row, rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * mi_width + blk_col] = 1; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1); x->plane[0].eobs[block] = 0; x->plane[0].txb_entropy_ctx[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { rd_stats->skip = 0; - x->blk_skip[blk_row * mi_width + blk_col] = 0; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0); } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->txfm_partition_cost[ctx][0]; @@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash) { // Linear search through the circular buffer to find matching hash. - int index; - for (int i = cur_record->num - 1; i >= 0; i--) { - index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN; - if (cur_record->hash_vals[index] == hash) return index; + for (int i = cur_record->index_start - 1; i >= 0; i--) { + if (cur_record->hash_vals[i] == hash) return i; } - + for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + int index; // If not found - add new RD info into the buffer and return its index if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { index = (cur_record->index_start + cur_record->num) % @@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, return index; } +typedef struct { + int leaf; + int8_t children[4]; +} RD_RECORD_IDX_NODE; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = { + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0, 0, 0, 0 } }, + { 1, { 0, 0, 0, 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0 } }, + { 1, { 0 } }, +}; + 
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = { + { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 5, 6 } }, + { 0, { 7, 8, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 7, 8 } }, + { 0, { 5, 6, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = { + { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } }, + { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = { + { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } }, + { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } }, + { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } }, + { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } }, + { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = { + { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } }, + { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } }, + { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } }, + { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } }, + { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = { + { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } }, + { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } }, + { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } }, + { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } }, + { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } }, + { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } }, + { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } }, + { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } }, + { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } }, + { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = { + { 0, { 1, -1, 
2, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = { + NULL, // BLOCK_4X4 + NULL, // BLOCK_4X8 + NULL, // BLOCK_8X4 + rd_record_tree_8x8, // BLOCK_8X8 + rd_record_tree_8x16, // BLOCK_8X16 + rd_record_tree_16x8, // BLOCK_16X8 + rd_record_tree_16x16, // BLOCK_16X16 + rd_record_tree_1_2, // BLOCK_16X32 + rd_record_tree_2_1, // BLOCK_32X16 + rd_record_tree_sqr, // BLOCK_32X32 + rd_record_tree_1_2, // BLOCK_32X64 + rd_record_tree_2_1, // BLOCK_64X32 + rd_record_tree_sqr, // BLOCK_64X64 + rd_record_tree_64x128, // BLOCK_64X128 + rd_record_tree_128x64, // BLOCK_128X64 + rd_record_tree_128x128, // BLOCK_128X128 + NULL, // BLOCK_4X16 + NULL, // BLOCK_16X4 + rd_record_tree_1_4, // BLOCK_8X32 + rd_record_tree_4_1, // BLOCK_32X8 + rd_record_tree_1_4, // BLOCK_16X64 + rd_record_tree_4_1, // BLOCK_64X16 +}; + +static const int rd_record_tree_size[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 0, // BLOCK_4X8 + 0, // BLOCK_8X4 + sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8 + sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16 + sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8 + sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64 + sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128 + 
sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64 + sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128 + 0, // BLOCK_4X16 + 0, // BLOCK_16X4 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16 +}; + +static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree, + BLOCK_SIZE bsize) { + const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize]; + const int size = rd_record_tree_size[bsize]; + for (int i = 0; i < size; ++i) { + if (rd_record[i].leaf) { + av1_zero(tree[i].children); + } else { + for (int j = 0; j < 4; ++j) { + const int8_t idx = rd_record[i].children[j]; + tree[i].children[j] = idx > 0 ? &tree[idx] : NULL; + } + } + } +} + // Go through all TX blocks that could be used in TX size search, compute // residual hash values for them and find matching RD info that stores previous // RD search results for these TX blocks. 
The idea is to prevent repeated @@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, // Hashing is performed only for square TX sizes larger than TX_4X4 if (max_square_tx_size < TX_8X8) return 0; - - const int bw_mi = mi_size_wide[bsize]; const int diff_stride = bw; const struct macroblock_plane *const p = &x->plane[0]; const int16_t *diff = &p->src_diff[0]; - + init_rd_record_tree(dst_rd_info, bsize); // Coordinates of the top-left corner of current block within the superblock // measured in pixels: const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; int cur_rd_info_idx = 0; int cur_tx_depth = 0; - uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; - uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; while (cur_tx_depth <= MAX_VARTX_DEPTH) { const int cur_tx_bw = tx_size_wide[cur_tx_size]; const int cur_tx_bh = tx_size_high[cur_tx_size]; if (cur_tx_bw < 8 || cur_tx_bh < 8) break; const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + const int tx_size_idx = cur_tx_size - TX_8X8; for (int row = 0; row < bh; row += cur_tx_bh) { for (int col = 0; col < bw; col += cur_tx_bw) { if (cur_tx_bw != cur_tx_bh) { @@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh); - // Find corresponding RD info based on the hash value. 
- const int rd_record_idx = - row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) + - col_in_sb; - - int idx = find_tx_size_rd_info( - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash); + const int record_idx = + row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb; + TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx]; + int idx = find_tx_size_rd_info(records, hash); dst_rd_info[cur_rd_info_idx].rd_info_array = - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx] - .tx_rd_info[idx]; - } - - // Update the output quadtree RD info structure. - av1_zero(dst_rd_info[cur_rd_info_idx].children); - const int this_mi_row = row / MI_SIZE; - const int this_mi_col = col / MI_SIZE; - if (cur_tx_depth > 0) { // Set up child pointers. - const int mi_index = this_mi_row * bw_mi + this_mi_col; - const int child_idx = child_idx_buf[mi_index]; - assert(child_idx < 4); - dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] = - &dst_rd_info[cur_rd_info_idx]; - } - if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx. 
- const int tx_bh_mi = cur_tx_bh / MI_SIZE; - const int tx_bw_mi = cur_tx_bw / MI_SIZE; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) { - memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx, - tx_bw_mi); - } - int child_idx = 0; - const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size]; - const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size]; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; - i += next_tx_bh_mi) { - for (int j = this_mi_col; j < this_mi_col + tx_bw_mi; - j += next_tx_bw_mi) { - assert(child_idx < 4); - child_idx_buf[i * bw_mi + j] = child_idx++; - } - } + &records->tx_rd_info[idx]; } ++cur_rd_info_idx; } @@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, const MACROBLOCKD *xd = &x->e_mbd; const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); - *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1); + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. 
@@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; - memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); + for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1); rd_stats->skip = 1; rd_stats->rate = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; int model_skip; - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, - &model_skip, NULL, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist, + &model_skip, NULL, NULL, NULL, NULL); const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); // If the modeled rd is a lot worse than the best so far, breakout. // TODO(debargha, urvang): Improve the model and make the check below // tighter. assert(cpi->sf.model_based_prune_tx_search_level >= 0 && cpi->sf.model_based_prune_tx_search_level <= 2); + static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE, + 4 + MODELRD_TYPE_TX_SEARCH_PRUNE }; if (!model_skip && - model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) > - ref_best_rd) + ((model_rd * + prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >> + 3) > ref_best_rd) return; } @@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, // Precompute residual hashes and find existing or add new RD records to // store and reuse rate and distortion values to speed up TX size search. 
- TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256]; + TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64]; int found_rd_info = 0; if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) { found_rd_info = @@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, assert(plane > 0); assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, - ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL); + &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL); + + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int blk_idx = blk_row * mi_width + blk_col; + av1_set_txb_context(x, plane, block, tx_size, ta, tl); + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) && + !xd->lossless[mbmi->segment_id]) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + } + + // Set chroma blk_skip to 0 + set_blk_skip(x, plane, blk_idx, 0); } // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. 
static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int plane; int is_cost_valid = 1; int64_t this_rd = 0; + int64_t skip_rd = 0; - if (ref_best_rd < 0) is_cost_valid = 0; + if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0; av1_init_rd_stats(rd_stats); - if (x->skip_chroma_rd) return is_cost_valid; + if (x->skip_chroma_rd) { + if (!is_cost_valid) av1_invalid_rd_stats(rd_stats); + + return is_cost_valid; + } + const BLOCK_SIZE bsizec = scale_chroma_bsize( bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); @@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, const int step = bh * bw; ENTROPY_CONTEXT ta[MAX_MIB_SIZE]; ENTROPY_CONTEXT tl[MAX_MIB_SIZE]; - RD_STATS pn_rd_stats; - av1_init_rd_stats(&pn_rd_stats); av1_get_entropy_contexts(bsizec, pd, ta, tl); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if ((this_rd > non_skip_ref_best_rd) && + (skip_rd > skip_ref_best_rd)) { + av1_invalid_rd_stats(rd_stats); + return 0; + } block += step; } } - - if (pn_rd_stats.rate == INT_MAX) { - is_cost_valid = 0; - break; - } - - av1_merge_rd_stats(rd_stats, &pn_rd_stats); - - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse)); - - if (this_rd > ref_best_rd) 
{ - is_cost_valid = 0; - break; - } } - } - - if (!is_cost_valid) { + } else { // reset cost value av1_invalid_rd_stats(rd_stats); } @@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { (mv->col >> 3) > mv_limits->col_max; } -static INLINE int get_single_mode(int this_mode, int ref_idx, - int is_comp_pred) { - int single_mode; +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx, int is_comp_pred) { + PREDICTION_MODE single_mode; if (is_comp_pred) { single_mode = ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); @@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx, return single_mode; } -/* If the current mode shares the same mv with other modes with higher prority, - * skip this mode. This priority order is nearest > global > near. */ -static int skip_repeated_mv(const AV1_COMMON *const cm, - const MACROBLOCK *const x, int this_mode, - const MV_REFERENCE_FRAME ref_frames[2]) { - const int is_comp_pred = ref_frames[1] > INTRA_FRAME; - const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - if (!is_comp_pred) { - if (this_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the same motion vector as NEARESTMV - return 1; - } - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // NEARMV has the same motion vector as GLOBALMV - return 1; - } - } - if (this_mode == GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // GLOBALMV has the same motion vector as NEARESTMV - return 1; - } - } - } else { - for (int i = 0; i < 2; ++i) { - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); - if (single_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the 
same motion vector as NEARESTMV in compound mode - return 1; - } - } - } - if (this_mode == NEAR_NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV - return 1; - } - } - if (this_mode == GLOBAL_GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV - return 1; - } - } - } - return 0; -} - static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row, int mi_col, int_mv *ref_mv_sub8x8[2], @@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; + const int plane = 0; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; int_mv ref_mv[2]; int ite, ref; @@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, struct macroblockd_plane *const pd = &xd->plane[0]; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; - int is_global[2]; + + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + conv_params.use_jnt_comp_avg = 0; + WarpTypesAllowed warp_types[2]; for (ref = 0; ref < 2; ++ref) { const WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->ref_frame[ref]]; - is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype); + 
const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype); + warp_types[ref].global_warp_allowed = is_global; + warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; } // Do joint motion search in compound mode to get more accurate mv. @@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, }; // Prediction buffer from second frame. - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); - uint8_t *second_pred; + DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); (void)ref_mv_sub8x8; + const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode); + const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0); + MV *const best_mv = &x->best_mv.as_mv; + const int search_range = SEARCH_RANGE_8P; + const int sadpb = x->sadperbit16; // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. for (ite = 0; ite < 4; ite++) { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; - int sadpb = x->sadperbit16; - MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; - MvLimits tmp_mv_limits = x->mv_limits; int id = ite % 2; // Even iterations search in the first reference frame, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. 
- const int plane = 0; - ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd); - conv_params.use_jnt_comp_avg = 0; - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[!id]; - warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; - + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.col >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = av1_get_ref_mv(x, ref); // Swap out the reference frame for a version that's been scaled to @@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[1] = xd->plane[plane].pre[1]; // Get the prediction block from the 'other' reference frame. - InterpFilters interp_filters = EIGHTTAP_REGULAR; + const InterpFilters interp_filters = EIGHTTAP_REGULAR; // Since we have scaled the reference frames to match the size of the // current frame we must use a unit scaling factor during mode selection. 
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); - av1_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters, - &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } else { - second_pred = (uint8_t *)second_pred_alloc_16; - av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, - second_pred, pw, &cur_mv[!id].as_mv, - &cm->sf_identity, pw, ph, &conv_params, - interp_filters, &warp_types, p_col, p_row, - plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &cur_mv[!id].as_mv, + &cm->sf_identity, pw, ph, &conv_params, + interp_filters, &warp_types[!id], p_col, p_row, + plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); const int order_idx = id != 0; av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, @@ -6324,16 +6540,13 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, if (id) xd->plane[plane].pre[0] = ref_yv12[id]; av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); - // Use the mv result from the single mode as mv predictor. // Use the mv result from the single mode as mv predictor. *best_mv = cur_mv[id].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost( - x, id, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); + av1_set_mvcost(x, id, ref_mv_idx); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Restore the pointer to the first prediction buffer. 
if (id) xd->plane[plane].pre[0] = ref_yv12[0]; - if (bestsme < last_besterr[id]) { cur_mv[id].as_mv = *best_mv; last_besterr[id] = bestsme; @@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; for (ref = 0; ref < 2; ++ref) { - av1_set_mvcost( - x, ref, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); - + av1_set_mvcost(x, ref, ref_mv_idx); const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: - bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col), - (MI_SIZE * mi_row), 0); + bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0); break; case OBMC_CAUSAL: - bestsme = av1_obmc_full_pixel_diamond( - cpi, x, &mvp_full, step_param, sadpb, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, - &(x->best_mv.as_mv), 0); + bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, + 1, &cpi->fn_ptr[bsize], &ref_mv, + &(x->best_mv.as_mv), 0); break; default: assert(0 && "Invalid motion mode!\n"); } @@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, cm->width, cm->height); - ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global; 
warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; // Get the prediction block from the 'other' reference frame. - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } else { - av1_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw, + other_mv, &sf, pw, ph, &conv_params, + mbmi->interp_filters, &warp_types, p_col, p_row, + plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, &xd->jcp_param.bck_offset, @@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, int bestsme = INT_MAX; int sadpb = x->sadperbit16; MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; + int search_range = SEARCH_RANGE_8P; MvLimits tmp_mv_limits = x->mv_limits; @@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed( // near mv modes to reduce distortion in subsequent blocks and also improve // visual quality. 
#define NEW_MV_DISCOUNT_FACTOR 8 -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext); static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x, - int this_mode, int_mv this_mv) { + PREDICTION_MODE this_mode, int_mv this_mv) { if (this_mode == NEWMV && this_mv.as_int != 0 && !cpi->rc.is_src_frame_alt_ref) { // Only discount new_mv when nearst_mv and all near_mv are zero, and the @@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, (int64_t)aom_sum_squares_i16(residual1, N)) * (1 << WEDGE_WEIGHT_BITS) / 2; int16_t *ds = residual0; - if (N < 64) - av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N); - else - av1_wedge_compute_delta_squares(ds, residual0, residual1, N); + + av1_wedge_compute_delta_squares(ds, residual0, residual1, N); for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); - // TODO(jingning): Make sse2 functions support N = 16 case - if (N < 64) - wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit); - else - wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); + wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = 
av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + // int rate2; + // int64_t dist2; + // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2); + // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n", + // sse, rate, dist, rate2, dist2); dist = dist2; + // rate = rate2; + rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int N = bw * bh; + const int N = 1 << num_pels_log2_lookup[bsize]; int rate; - uint64_t sse; int64_t dist; - int64_t rd0; DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = 
INT64_MAX; DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; // try each mask type and its inverse for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { // build mask and inverse if (hbd) av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else - av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, - bw, bh, bw); + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); // compute rd for mask - sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N); + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd0 = RDCOST(x->rdmult, rate, dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; best_rd = rd0; } } - - // make final mask mbmi->interinter_comp.mask_type = best_mask_type; - if (hbd) - av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0), - bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); - else - av1_build_compound_diffwtd_mask( - xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw); - + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } return best_rd; } @@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, } } -static int 
interinter_compound_motion_search( - const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { +static int interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode, + int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; @@ -7440,11 +7638,40 @@ static int interinter_compound_motion_search( return tmp_rate_mv; } +static void get_inter_predictors_masked_compound( + const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int can_use_previous = cm->allow_warped_motion; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); + const struct buf_2d *const src = &x->plane[0].src; + if (get_bitdepth_data_path_index(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, + bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +} + static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int 
*rs2, int rate_mv, - BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, - int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) { + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd, + int *calc_pred_masked_compound) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type( int64_t tmp_skip_sse_sb; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + if (*calc_pred_masked_compound) { + get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0, + preds1, residual1, diff10, strides); + *calc_pred_masked_compound = 0; + } + best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10); *rs2 += get_interinter_compound_mask_rate(x, mbmi); best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); - if (have_newmv_in_inter_mode(this_mode) && - use_masked_motion_search(compound_type)) { + // Although the true rate_mv might be different after motion search, but it + // is unlikely to be the best mode considering the transform rd cost and other + // mode overhead cost + int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); + if (mode_rd > ref_best_rd) return INT64_MAX; + + if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) { *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, 
mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; @@ -7508,12 +7746,72 @@ typedef struct { int (*single_newmv_valid)[REF_FRAMES]; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array - int64_t (*modelled_rd)[REF_FRAMES]; + int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES]; int ref_frame_cost; int single_comp_cost; + int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; + int skip_motion_mode; + INTERINTRA_MODE *inter_intra_mode; } HandleInterModeArgs; +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. */ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != 
MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(x, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, const AV1_COMMON *cm, const MACROBLOCK *x) { @@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd( const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - int tmp_rate, tmp_skip_sb = 0; - int64_t tmp_dist, tmp_skip_sse = INT64_MAX; + int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 }; + int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 }; const InterpFilters last_best = mbmi->interp_filters; mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - if (!skip_pred) { - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); - av1_subtract_plane(x, bsize, 0); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, - &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb, - &tmp_skip_sse, NULL, NULL, NULL); -#endif + assert(skip_pred != 2); + assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags)); + assert(rate[0] >= 0); + assert(dist[0] >= 0); + assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1)); + assert(skip_sse_sb[0] >= 0); + assert(rate[1] >= 0); + 
assert(dist[1] >= 0); + assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1)); + assert(skip_sse_sb[1] >= 0); + + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL); + tmp_rate[1] = tmp_rate[0]; + tmp_dist[1] = tmp_dist[0]; + } else { + // only luma MC is skipped + tmp_rate[1] = rate[0]; + tmp_dist[1] = dist[0]; + } if (num_planes > 1) { - int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); - if (tmp_y_rd > *rd) { - mbmi->interp_filters = last_best; - return 0; + for (int plane = 1; plane < num_planes; ++plane) { + int tmp_rate_uv, tmp_skip_sb_uv; + int64_t tmp_dist_uv, tmp_skip_sse_uv; + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane); + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv, + &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL); + tmp_rate[1] = + (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX); + tmp_dist[1] += tmp_dist_uv; + tmp_skip_sb[1] &= tmp_skip_sb_uv; + tmp_skip_sse[1] += tmp_skip_sse_uv; } - int tmp_rate_uv, tmp_skip_sb_uv; - int64_t tmp_dist_uv, tmp_skip_sse_uv; - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 1; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if 
DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1, - &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv, - &tmp_skip_sse_uv, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv, - &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, - NULL, NULL); -#endif - tmp_rate += tmp_rate_uv; - tmp_skip_sb &= tmp_skip_sb_uv; - tmp_dist += tmp_dist_uv; - tmp_skip_sse += tmp_skip_sse_uv; } } else { - tmp_rate = *rate; - tmp_dist = *dist; + // both luma and chroma MC is skipped + tmp_rate[1] = rate[1]; + tmp_dist[1] = dist[1]; } - int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd < *rd) { *rd = tmp_rd; *switchable_rate = tmp_rs; - *skip_txfm_sb = tmp_skip_sb; - *skip_sse_sb = tmp_skip_sse; - *rate = tmp_rate; - *dist = tmp_dist; - if (!skip_pred) { + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred == 0) { + // Overwrite the data as current filter is the best one + tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1]; + tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1]; + memcpy(rate, tmp_rate, sizeof(*rate) * 2); + memcpy(dist, tmp_dist, sizeof(*dist) * 2); + memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2); + memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2); + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) { + // As luma MC data is not computed, update of luma data can be skipped + rate[1] = tmp_rate[1]; + dist[1] = tmp_dist[1]; + skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1]; + skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1]; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be 
recomputed + x->recalc_luma_mc_data ^= 1; + } swap_dst_buf(xd, dst_bufs, num_planes); } return 1; @@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd( int i; const int bw = block_size_wide[bsize]; assert(best_dual_mode == 0); - if ((bw <= 4) && (!skip_hor)) { - int skip_pred = 1; + if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { @@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd( dist)) { best_dual_mode = i; } - skip_pred = 0; + skip_pred = skip_hor; } } else { for (i = 1; i < SWITCHABLE_FILTERS; ++i) { @@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd( int best_dual_mode, int filter_set_size) { int i; const int bh = block_size_high[bsize]; - if ((bh <= 4) && (!skip_ver)) { - int skip_pred = 1; + if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP assert(filter_set_size == DUAL_FILTER_SET_SIZE); @@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd( switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, i, switchable_ctx, skip_pred, rate, dist); - skip_pred = 0; + skip_pred = skip_ver; } } else { for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; @@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, return 0; } } + if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0; return 1; } @@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, const int comp_idx = mbmi->compound_idx; const int offset = 
x->interp_filter_stats_idx[comp_idx]; if (offset < MAX_INTERP_FILTER_STATS) { - INTERPOLATION_FILTER_STATS stat = { - mbmi->interp_filters, - { mbmi->mv[0], mbmi->mv[1] }, - { mbmi->ref_frame[0], mbmi->ref_frame[1] }, - }; + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type }; x->interp_filter_stats[comp_idx][offset] = stat; x->interp_filter_stats_idx[comp_idx]++; } @@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search( int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, - int64_t *const skip_sse_sb) { + int64_t *const skip_sse_sb, const int skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int need_search = av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); - int i, tmp_rate; - int64_t tmp_dist; + int i; + // Index 0 corresponds to luma rd data and index 1 corresponds to cummulative + // data of all planes + int tmp_rate[2] = { 0, 0 }; + int64_t tmp_dist[2] = { 0, 0 }; + int best_skip_txfm_sb[2] = { 1, 1 }; + int64_t best_skip_sse_sb[2] = { 0, 0 }; + const int ref_frame = xd->mi[0]->ref_frame[0]; (void)single_filter; int match_found = -1; @@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes 
- 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL, - NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, - skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); -#endif // DNN_BASED_RD_INTERP_FILTER - *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); + if (!skip_build_pred) + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL); + if (num_planes > 1) + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1], + &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL, + NULL); + tmp_rate[1] = + (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX); + assert(tmp_rate[1] >= 0); + tmp_dist[1] = tmp_dist[0] + tmp_dist[1]; + best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1]; + best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1]; + *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]); + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); if (assign_filter != SWITCHABLE || match_found != -1) { return 0; @@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search( av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); return 0; } - int skip_hor = 1; - int skip_ver = 1; + if (args->modelled_rd != NULL) { + if (has_second_ref(mbmi)) { + const int ref_mv_idx = mbmi->ref_mv_idx; + int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; + const int mode0 = compound_ref0_mode(mbmi->mode); + const int mode1 = compound_ref1_mode(mbmi->mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { + return INT64_MAX; + } + } + } + + x->recalc_luma_mc_data = 0; + // skip_flag=xx (in binary form) + // Setting 0th flag corresonds to skipping luma MC and setting 1st bt + // corresponds to skipping chroma MC skip_flag=0 corresponds to "Don't skip + // luma and chroma MC" Skip flag=1 corresponds to "Skip Luma MC only" + // Skip_flag=2 is not a valid case + // skip_flag=3 corresponds to "Skip both luma and chroma MC" + int skip_hor = cpi->default_interp_skip_flags; + int skip_ver = cpi->default_interp_skip_flags; const int is_compound = has_second_ref(mbmi); - for (int k = 0; k < num_planes - 1; ++k) { - struct macroblockd_plane *const pd = &xd->plane[k]; - const int bw = pd->width; - const int bh = pd->height; - for (int j = 0; j < 1 + is_compound; ++j) { - const MV mv = mbmi->mv[j].as_mv; + assert(is_intrabc_block(mbmi) == 0); + for (int j = 0; j < 1 + is_compound; ++j) { + const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME]; + const struct scale_factors *const sf = &ref_buf->sf; + // TODO(any): Refine skip flag calculation considering scaling + if (av1_is_scaled(sf)) { + skip_hor = 0; + skip_ver = 0; + break; + } + const MV mv = mbmi->mv[j].as_mv; + int skip_hor_plane = 0; + int skip_ver_plane = 0; + for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) { + struct macroblockd_plane *const pd = &xd->plane[k]; + const int bw = pd->width; + const int bh = pd->height; const MV mv_q4 = clamp_mv_to_umv_border_sb( xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - skip_hor &= (sub_x == 0); - skip_ver &= 
(sub_y == 0); - } + skip_hor_plane |= ((sub_x == 0) << k); + skip_ver_plane |= ((sub_y == 0) << k); + } + skip_hor = skip_hor & skip_hor_plane; + skip_ver = skip_ver & skip_ver_plane; + // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" + assert(skip_hor != 2); + assert(skip_ver != 2); + } + // When compond prediction type is compound segment wedge, luma MC and chroma + // MC need to go hand in hand as mask generated during luma MC is reuired for + // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during + // vertical filter decision may be incorrect as temporary MC evaluation + // overwrites the mask. Make skip_ver as 0 for this case so that mask is + // populated during luma MC + if (is_compound && mbmi->compound_idx == 1 && + mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { + assert(mbmi->comp_group_idx == 1); + if (skip_hor == 0 && skip_ver == 1) skip_ver = 0; } // do interp_filter search const int filter_set_size = DUAL_FILTER_SET_SIZE; @@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search( // EIGHTTAP_REGULAR mode is calculated beforehand best_dual_mode = find_best_horiz_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, - &tmp_rate, &tmp_dist, best_dual_mode); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, + tmp_rate, tmp_dist, best_dual_mode); // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes find_best_vert_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, - &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, + tmp_rate, tmp_dist, best_dual_mode, filter_set_size); } else { // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { @@ -7912,12 +8316,25 @@ 
static int64_t interpolation_filter_search( if (filter_x != filter_y) continue; } interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i, switchable_ctx, 0, &tmp_rate, - &tmp_dist); + switchable_rate, best_skip_txfm_sb, + best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0, + tmp_rate, tmp_dist); + assert(x->recalc_luma_mc_data == 0); } } swap_dst_buf(xd, dst_bufs, num_planes); + // Recompute final MC data if required + if (x->recalc_luma_mc_data == 1) { + // Recomputing final luma MC data is required only if the same was skipped + // in either of the directions Condition below is necessary, but not + // sufficient + assert((skip_hor == 1) || (skip_ver == 1)); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); + // save search results if (cpi->sf.skip_repeat_interpolation_filter_search) { assert(match_found == -1); @@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search( return 0; } +static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int mode_rate, int64_t ref_best_rd) { + /* + * This function combines y and uv planes' transform search processes + * together, when the prediction is generated. It first does subtration to + * obtain the prediction error. Then it calls + * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and + * handles the early terminations happen in those functions. At the end, it + * computes the rd_stats/_y/_uv accordingly. 
+ */ + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int skip_txfm_sb = 0; + const int num_planes = av1_num_planes(cm); + const int ref_frame_1 = mbmi->ref_frame[1]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + const int skip_ctx = av1_get_skip_context(xd); + const int64_t min_header_rate = + mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + av1_invalid_rd_stats(rd_stats); + return 0; + } + + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + av1_init_rd_stats(rd_stats_uv); + rd_stats->rate = mode_rate; + + if (!cpi->common.all_lossless) + check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); + if (!skip_txfm_sb) { + int64_t non_skip_rdcosty = INT64_MAX; + int64_t skip_rdcosty = INT64_MAX; + int64_t min_rdcosty = INT64_MAX; + int is_cost_valid_uv = 0; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // Motion mode + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } + + if (rd_stats_y->rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + // TODO(angiebird): check if we need this + // 
restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + non_skip_rdcosty = RDCOST( + x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist); + skip_rdcosty = + RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse); + min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + + if (min_rdcosty > ref_best_rd) { + int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) > + rd_thresh) + av1_invalid_rd_stats(rd_stats_y); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + if (num_planes > 1) { + /* clang-format off */ + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, + ref_best_rd - non_skip_rdcosty, + ref_best_rd - skip_rdcosty, FTXS_NONE); + if (!is_cost_valid_uv) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } else { + av1_init_rd_stats(rd_stats_uv); + } + if (rd_stats->skip) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + mbmi->skip = 0; + // here mbmi->skip temporarily plays a role as what this_skip2 does + + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } else if (!xd->lossless[mbmi->segment_id] && + (RDCOST(x->rdmult, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][0], + rd_stats->dist) >= + RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + rd_stats->dist = rd_stats->sse; + 
rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + mbmi->skip = 1; + } else { + rd_stats->rate += x->skip_cost[skip_ctx][0]; + mbmi->skip = 0; + } + } else { + x->skip = 1; + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); + // The cost of skip bit needs to be added. + mbmi->skip = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->skip = 1; + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } + return 1; +} + +static int handle_inter_intra_mode(const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, + int64_t ref_best_rd, int *rate_mv, + int *tmp_rate2, BUFFER_SET *orig_dst) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t rd, best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int bw = block_size_wide[bsize]; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + const int_mv mv0 = mbmi->mv[0]; + const int is_wedge_used = is_interintra_wedge_used(bsize); + int rwedge = is_wedge_used ? 
x->wedge_interintra_cost[bsize][0] : 0; + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]]; + int j = 0; + if (cpi->sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == INTERINTRA_MODES) { + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) { + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum); + best_interintra_rd = rd; + if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { + return -1; + } + if (is_wedge_used) { + int64_t best_interintra_rd_nowedge = rd; + int64_t best_interintra_rd_wedge = INT64_MAX; + 
int_mv tmp_mv; + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + x->wedge_interintra_cost[bsize][1]; + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0); + rd = INT64_MAX; + // Refine motion vector. + if (have_newmv_in_inter_mode(mbmi->mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + tmp_mv = mbmi->mv[0]; + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, &tmp_rate_mv, + 0); + if (mbmi->mv[0].as_int != tmp_mv.as_int) { + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, + bsize); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, + dist_sum); + } + } + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->mv[0].as_int = mv0.as_int; + av1_build_inter_predictors_sby(cm, 
xd, mi_row, mi_col, orig_dst, bsize); + } + } else { + mbmi->use_wedge_interintra = 0; + } + } // if (is_interintra_wedge_used(bsize)) + if (num_planes > 1) { + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + return 0; +} + // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, @@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, HandleInterModeArgs *const args, - int64_t ref_best_rd, const int *refs, int rate_mv, - BUFFER_SET *orig_dst + int64_t ref_best_rd, const int *refs, + int *rate_mv, BUFFER_SET *orig_dst #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + int do_tx_search, InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *const cm = &cpi->common; @@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; + const int rate2_nocoeff = rd_stats->rate; + int best_xskip, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; MB_MODE_INFO base_mbmi, best_mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int rate_mv0 = *rate_mv; + int interintra_allowed = cm->seq_params.enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; - int total_samples; - - (void)rate_mv; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; av1_invalid_rd_stats(&best_rd_stats); - 
aom_clear_system_state(); - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); - total_samples = mbmi->num_proj_ref[0]; - rate2_nocoeff = rd_stats->rate; + mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + if (cm->switchable_motion_mode) { + last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion); + } + if (last_motion_mode_allowed == WARPED_CAUSAL) { + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); + } + int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + last_motion_mode_allowed = OBMC_CAUSAL; + } base_mbmi = *mbmi; - MOTION_MODE last_motion_mode_allowed = - cm->switchable_motion_mode - ? motion_mode_allowed(xd->global_motion, xd, mbmi, - cm->allow_warped_motion) - : SIMPLE_TRANSLATION; - assert(mbmi->ref_frame[1] != INTRA_FRAME); - const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; + const int switchable_rate = + av1_is_interp_needed(xd) ? 
av1_get_switchable_rate(cm, x, xd) : 0; int64_t best_rd = INT64_MAX; - + int best_rate_mv = rate_mv0; for (int mode_index = (int)SIMPLE_TRANSLATION; mode_index <= (int)last_motion_mode_allowed + interintra_allowed; mode_index++) { + if (args->skip_motion_mode && mode_index) continue; int64_t tmp_rd = INT64_MAX; int tmp_rate2 = rate2_nocoeff; int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; int skip_txfm_sb = 0; + int tmp_rate_mv = rate_mv0; *mbmi = base_mbmi; if (is_interintra_mode) { @@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, // The prediction is calculated before motion_mode_rd() is called in // handle_inter_mode() } else if (mbmi->motion_mode == OBMC_CAUSAL) { - mbmi->motion_mode = OBMC_CAUSAL; - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; - + uint32_t cur_mv = mbmi->mv[0].as_int; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; #if USE_DISCOUNT_NEWMV_TEST @@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if (mbmi->mv[0].as_int != cur_mv) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); } - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; mbmi->motion_mode = WARPED_CAUSAL; - mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; mbmi->interp_filters 
= av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); // Select the samples according to motion vector difference - if (mbmi->num_proj_ref[0] > 1) { - mbmi->num_proj_ref[0] = selectSamples( - &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); } - if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col)) { + &mbmi->wm_params, mi_row, mi_col)) { // Refine MV for NEWMV mode - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { const int_mv mv0 = mbmi->mv[0]; - const WarpedMotionParams wm_params0 = mbmi->wm_params[0]; - int num_proj_ref0 = mbmi->num_proj_ref[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + int num_proj_ref0 = mbmi->num_proj_ref; // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, @@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } else { // Restore the old MV and WM parameters. 
mbmi->mv[0] = mv0; - mbmi->wm_params[0] = wm_params0; - mbmi->num_proj_ref[0] = num_proj_ref0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; } } @@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, continue; } } else if (is_interintra_mode) { - INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t rd, best_interintra_rd = INT64_MAX; - int rmode, rate_sum; - int64_t dist_sum; - int j; - int tmp_rate_mv = 0; - int tmp_skip_txfm_sb; - int bw = block_size_wide[bsize]; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); - uint8_t *tmp_buf, *intrapred; - const int *const interintra_mode_cost = - x->interintra_mode_cost[size_group_lookup[bsize]]; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); - intrapred = CONVERT_TO_BYTEPTR(intrapred_); - } else { - tmp_buf = tmp_buf_; - intrapred = intrapred_; - } - const int_mv mv0 = mbmi->mv[0]; - - mbmi->ref_frame[1] = NONE_FRAME; - xd->plane[0].dst.buf = tmp_buf; - xd->plane[0].dst.stride = bw; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); - - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = 
mbmi->interintra_mode; - } - } - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); - best_interintra_rd = rd; - - if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { - // restore ref_frame[1] - mbmi->ref_frame[1] = ref_frame_1; - continue; - } - - if (is_interintra_wedge_used(bsize)) { - int64_t best_interintra_rd_nowedge = INT64_MAX; - int64_t best_interintra_rd_wedge = INT64_MAX; - int_mv tmp_mv; - InterpFilters backup_interp_filters = mbmi->interp_filters; - int rwedge = x->wedge_interintra_cost[bsize][0]; - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum); - best_interintra_rd_nowedge = rd; - - // Disable wedge search if source variance is small - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { - mbmi->use_wedge_interintra = 1; - - rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + - x->wedge_interintra_cost[bsize][1]; - - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); - - best_interintra_rd_wedge += - RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); - // Refine motion vector. 
- if (have_newmv_in_inter_mode(mbmi->mode)) { - // get negative of mask - const uint8_t *mask = av1_get_contiguous_soft_mask( - mbmi->interintra_wedge_index, 1, bsize); - tmp_mv = av1_get_ref_mv(x, 0); - compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, - mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0); - mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, - NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, - dist_sum); - if (rd >= best_interintra_rd_wedge) { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - mbmi->interp_filters = backup_interp_filters; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - } else { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - // Evaluate closer to true rd - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - best_interintra_rd_wedge = rd; - if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { - mbmi->use_wedge_interintra = 1; - mbmi->mv[0].as_int = tmp_mv.as_int; - tmp_rate2 += tmp_rate_mv - rate_mv; - } else { - mbmi->use_wedge_interintra = 0; - mbmi->mv[0].as_int = mv0.as_int; - mbmi->interp_filters = backup_interp_filters; - } - } else { - mbmi->use_wedge_interintra = 0; - } - } // if (is_interintra_wedge_used(bsize)) - restore_dst_buf(xd, *orig_dst, num_planes); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + const int ret = handle_inter_intra_mode( + cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv, + &tmp_rate2, orig_dst); + if (ret < 0) continue; } 
if (!cpi->common.all_lossless) @@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->sse = 0; rd_stats->skip = 1; rd_stats->rate = tmp_rate2; - if (av1_is_interp_needed(xd)) - rd_stats->rate += av1_get_switchable_rate(cm, x, xd); + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; if (interintra_allowed) { rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] [mbmi->ref_frame[1] == INTRA_FRAME]; @@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; } } + if (!skip_txfm_sb) { #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t est_rd = 0; int est_skip = 0; - if (cpi->sf.inter_mode_rd_model_estimation) { - InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type]; + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type]; if (md->ready) { const int64_t curr_sse = get_sse(cpi, x); - est_rd = - get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate); + est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse, + rd_stats->rate); est_skip = est_rd * 0.8 > *best_est_rd; -#if INTER_MODE_RD_TEST - if (est_rd < *best_est_rd) { - *best_est_rd = est_rd; - } -#else // INTER_MODE_RD_TEST if (est_skip) { - ++md->skip_count; mbmi->ref_frame[1] = ref_frame_1; continue; } else { if (est_rd < *best_est_rd) { *best_est_rd = est_rd; } - ++md->non_skip_count; } -#endif // INTER_MODE_RD_TEST } } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + } - int64_t rdcosty = INT64_MAX; - int is_cost_valid_uv = 0; - - // cost and distortion - av1_subtract_plane(x, bsize, 0); - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // Motion mode - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, - ref_best_rd); -#if CONFIG_COLLECT_RD_STATS == 2 - 
PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); -#endif // CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (!do_tx_search) { + const int64_t curr_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (cm->reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); + } } else { - super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); - memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y->skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); } - - if (rd_stats_y->rate == INT_MAX) { - av1_invalid_rd_stats(rd_stats); - if (mbmi->motion_mode != SIMPLE_TRANSLATION || - mbmi->ref_frame[1] == INTRA_FRAME) { - mbmi->ref_frame[1] = ref_frame_1; - continue; - } else { - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = ref_frame_1; + } else { +#endif + int mode_rate = rd_stats->rate; + if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y, + rd_stats_uv, mode_rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { return INT64_MAX; } + continue; } - - av1_merge_rd_stats(rd_stats, rd_stats_y); - - rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); - if (num_planes > 1) { - /* clang-format off */ - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty, - FTXS_NONE); - if (!is_cost_valid_uv) { - 
mbmi->ref_frame[1] = ref_frame_1; - continue; + if (!skip_txfm_sb) { + const int64_t curr_rd = + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); - } else { - av1_init_rd_stats(rd_stats_uv); - } -#if CONFIG_RD_DEBUG - // record transform block coefficient cost - // TODO(angiebird): So far rd_debug tool only detects discrepancy of - // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi - // here because we already collect the coefficient cost. Move this part to - // other place when we need to compare non-coefficient cost. - mbmi->rd_stats = *rd_stats; -#endif // CONFIG_RD_DEBUG - const int skip_ctx = av1_get_skip_context(xd); - if (rd_stats->skip) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - mbmi->skip = 0; - // here mbmi->skip temporarily plays a role as what this_skip2 does - } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][0], - rd_stats->dist) >= RDCOST(x->rdmult, - x->skip_cost[skip_ctx][1], - rd_stats->sse))) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - rd_stats->dist = rd_stats->sse; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - mbmi->skip = 1; - } else { - rd_stats->rate += x->skip_cost[skip_ctx][0]; - mbmi->skip = 0; - } - *disable_skip = 0; + *disable_skip = 0; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && - cm->tile_rows == 1) { -#if INTER_MODE_RD_TEST - if (md->ready) { - int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (est_skip) { - ++md->skip_count; - if (real_rd < ref_best_rd) { - ++md->fp_skip_count; - } - // int fp_skip = real_rd < ref_best_rd; - // 
printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd - // %ld ref_best_rd %ld\n", - // est_skip, fp_skip, est_rd, *best_est_rd, real_rd, - // ref_best_rd); - } else { - ++md->non_skip_count; - } + if (cpi->sf.inter_mode_rd_model_estimation) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); } -#endif // INTER_MODE_RD_TEST - inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][mbmi->skip], - rd_stats->rate, ref_best_rd); - } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS - int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (curr_rd < ref_best_rd) { - ref_best_rd = curr_rd; + } else { + *disable_skip = 1; } - } else { - x->skip = 1; - *disable_skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); - - // The cost of skip bit needs to be added. 
- mbmi->skip = 0; - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; - - rd_stats->dist = 0; - rd_stats->sse = 0; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->skip = 1; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS } +#endif if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { if (is_nontrans_global_motion(xd, xd->mi[0])) { @@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, } tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if ((mbmi->motion_mode == SIMPLE_TRANSLATION && - mbmi->ref_frame[1] != INTRA_FRAME) || - (tmp_rd < best_rd)) { + if (mode_index == 0) + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + if ((mode_index == 0) || (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); best_xskip = x->skip; best_disable_skip = *disable_skip; if (best_xskip) break; } } mbmi->ref_frame[1] = ref_frame_1; - + *rate_mv = best_rate_mv; if (best_rd == INT64_MAX) { av1_invalid_rd_stats(rd_stats); restore_dst_buf(xd, *orig_dst, num_planes); @@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, *rd_stats_y = best_rd_stats_y; if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); x->skip = best_xskip; *disable_skip = best_disable_skip; @@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, return 0; } -#ifndef NDEBUG -static INLINE int is_single_inter_mode(int this_mode) { - return this_mode >= SINGLE_INTER_MODE_START && - this_mode < SINGLE_INTER_MODE_END; -} -#endif 
- -static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { - assert(is_single_inter_mode(single_mode)); +static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode, + uint8_t ref_mv_idx) { + assert(is_inter_singleref_mode(single_mode)); int ref_mv_offset; if (single_mode == NEARESTMV) { ref_mv_offset = 0; @@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { return ref_mv_offset; } -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext) { const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); const int is_comp_pred = ref_frame[1] > INTRA_FRAME; - const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); - assert(is_single_inter_mode(single_mode)); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, ref_idx, is_comp_pred); + assert(is_inter_singleref_mode(single_mode)); if (single_mode == NEWMV) { this_mv->as_int = INVALID_MV; } else if (single_mode == GLOBALMV) { @@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, } // This function update the non-new mv for the current prediction mode -static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, +static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; @@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, int_mv this_mv; get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame, x->mbmi_ext); - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, i, is_comp_pred); if (single_mode == NEWMV) { 
cur_mv[i] = this_mv; } else { @@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, return cost; } -static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_col, int mi_row, - int_mv *cur_mv, int masked_compound_used, - BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst, - int *rate_mv, int64_t *rd, - RD_STATS *rd_stats, int64_t ref_best_rd) { +// Struct for buffers used by compound_type_rd() function. +// For sizes and alignment of these arrays, refer to +// alloc_compound_type_rd_buffers() function. +typedef struct { + uint8_t *pred0; + uint8_t *pred1; + int16_t *residual1; // src - pred1 + int16_t *diff10; // pred1 - pred0 + uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask +} CompoundTypeRdBuffers; + +static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_col, int mi_row, + int_mv *cur_mv, int masked_compound_used, + BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, + CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, + int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; int rate_sum, rs2; int64_t dist_sum; @@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t tmp_skip_sse_sb; INTERINTER_COMPOUND_DATA best_compound_data; best_compound_data.type = COMPOUND_AVERAGE; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 - DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 - uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { 
pred1 }; + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; int strides[1] = { bw }; int tmp_rate_mv; const int num_pix = 1 << num_pels_log2_lookup[bsize]; const int mask_len = 2 * num_pix * sizeof(uint8_t); COMPOUND_TYPE cur_type; int best_compmode_interinter_cost = 0; - int can_use_previous = cm->allow_warped_motion; + int calc_pred_masked_compound = 1; best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); - const struct buf_2d *const src = &x->plane[0].src; - if (get_bitdepth_data_path_index(xd)) { - aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(pred1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), - bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd); - } else { - aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1, - bw); - aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw); - } - } - const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0]; - const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst; - const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst; for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; if (!is_interinter_compound_used(cur_type, bsize)) continue; @@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, } masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; rs2 = masked_type_cost; - // No need to call av1_build_inter_predictors_sby here - // 1. 
COMPOUND_AVERAGE is always the first candidate - // 2. av1_build_inter_predictors_sby has been called by - // interpolation_filter_search - int64_t est_rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + int64_t est_rd = + estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (est_rd != INT64_MAX) + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + } // use spare buffer for following compound type try - restore_dst_buf(xd, *backup_buf, 1); - if (est_rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + restore_dst_buf(xd, *tmp_dst, 1); } else { mbmi->comp_group_idx = 1; masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; @@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, *rd / 3 < ref_best_rd) { best_rd_cur = build_and_cost_compound_type( cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, - &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row, - mi_col); + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, mi_row, mi_col, rd_stats->rate, ref_best_rd, + &calc_pred_masked_compound); } } if (best_rd_cur < *rd) { *rd = best_rd_cur; best_compound_data = mbmi->interinter_comp; if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) { - memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len); + memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); } best_compmode_interinter_cost = rs2; if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { + if (cur_type == COMPOUND_WEDGE) { best_tmp_rate_mv = tmp_rate_mv; best_mv[0].as_int = mbmi->mv[0].as_int; best_mv[1].as_int = 
mbmi->mv[1].as_int; @@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->comp_group_idx = (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1; mbmi->interinter_comp = best_compound_data; - memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len); + memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); } if (have_newmv_in_inter_mode(this_mode)) { mbmi->mv[0].as_int = best_mv[0].as_int; mbmi->mv[1].as_int = best_mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_comp.type)) { + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { rd_stats->rate += best_tmp_rate_mv - *rate_mv; *rate_mv = best_tmp_rate_mv; } } - restore_dst_buf(xd, *best_buf, 1); + restore_dst_buf(xd, *orig_dst, 1); return best_compmode_interinter_cost; } +static INLINE int is_single_newmv_valid(HandleInterModeArgs *args, + MB_MODE_INFO *mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? 
AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +typedef struct { + int64_t rd; + int drl_cost; + int rate_mv; + int_mv mv; +} inter_mode_info; + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, - HandleInterModeArgs *args, int64_t ref_best_rd + HandleInterModeArgs *args, int64_t ref_best_rd, + uint8_t *const tmp_buf, + CompoundTypeRdBuffers *rd_buffers #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + const int do_tx_search, + InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *cm = &cpi->common; @@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int rate_mv = 0; - DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); int64_t rd = INT64_MAX; - BUFFER_SET orig_dst, tmp_dst; + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. 
+ struct macroblockd_plane *p = xd->plane; + BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; int best_disable_skip; int best_xskip; - int plane_rate[MAX_MB_PLANE] = { 0 }; - int64_t plane_sse[MAX_MB_PLANE] = { 0 }; - int64_t plane_dist[MAX_MB_PLANE] = { 0 }; int64_t newmv_ret_val = INT64_MAX; int_mv backup_mv[2] = { { 0 } }; int backup_rate_mv = 0; + inter_mode_info mode_info[MAX_REF_MV_SERCH]; int comp_idx; const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & (mbmi->mode != GLOBAL_GLOBALMV); - const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1); - // TODO(jingning): This should be deprecated shortly. - const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; - const int ref_set = - has_drl ? AOMMIN(MAX_REF_MV_SERCH, - mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset) - : 1; + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 
1 : 0; + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mode_info[ref_mv_idx].mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].rd = INT64_MAX; + if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) { if (mbmi->ref_frame[0] == LAST2_FRAME || mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[1] == LAST2_FRAME || mbmi->ref_frame[1] == LAST3_FRAME) { - if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset] + if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv] .weight < REF_CAT_LEVEL) { continue; } @@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; + mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = ref_mv_idx; - if (is_comp_pred) { - for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { - const int single_mode = - get_single_mode(this_mode, ref_idx, is_comp_pred); - if (single_mode == NEWMV && - args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]] - .as_int == INVALID_MV) - continue; - } + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) { + continue; } rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; - rd_stats->rate += + const int drl_cost = get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } - const RD_STATS backup_rd_stats = *rd_stats; - const MB_MODE_INFO backup_mbmi = *mbmi; int64_t best_rd2 = INT64_MAX; + const RD_STATS backup_rd_stats = *rd_stats; // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. 
for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { int rs = 0; int compmode_interinter_cost = 0; - *rd_stats = backup_rd_stats; - *mbmi = backup_mbmi; mbmi->compound_idx = comp_idx; - if (is_comp_pred && comp_idx == 0) { + *rd_stats = backup_rd_stats; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->comp_group_idx = 0; - mbmi->compound_idx = 0; const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); @@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } else { rd_stats->rate += rate_mv; } - } - for (i = 0; i < is_comp_pred + 1; ++i) { - mbmi->mv[i].as_int = cur_mv[i].as_int; - } - // Initialise tmp_dst and orig_dst buffers to prevent "may be used - // uninitialized" warnings in GCC when the stream is monochrome. - memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); - memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); - - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary. 
- for (i = 0; i < num_planes; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; + if (cpi->sf.skip_repeated_newmv) { + if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) { + int skip = 0; + int this_rate_mv = 0; + for (i = 0; i < ref_mv_idx; ++i) { + // Check if the motion search result same as previous results + if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) { + // If the compared mode has no valid rd, it is unlikely this + // mode will be the best mode + if (mode_info[i].rd == INT64_MAX) { + skip = 1; + break; + } + // Compare the cost difference including drl cost and mv cost + if (mode_info[i].mv.as_int != INVALID_MV) { + const int compare_cost = + mode_info[i].rate_mv + mode_info[i].drl_cost; + const int_mv ref_mv = av1_get_ref_mv(x, 0); + this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv, + &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + const int this_cost = this_rate_mv + drl_cost; + + if (compare_cost < this_cost) { + skip = 1; + break; + } else { + // If the cost is less than current best result, make this + // the best and update corresponding variables + if (best_mbmi.ref_mv_idx == i) { + assert(best_rd != INT64_MAX); + best_mbmi.ref_mv_idx = ref_mv_idx; + best_rd_stats.rate += this_cost - compare_cost; + best_rd = RDCOST(x->rdmult, best_rd_stats.rate, + best_rd_stats.dist); + if (best_rd < ref_best_rd) ref_best_rd = best_rd; + + skip = 1; + break; + } + } + } + } + } + if (skip) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = + args->modelled_rd[this_mode][i][refs[0]]; + args->simple_rd[this_mode][ref_mv_idx][refs[0]] = + args->simple_rd[this_mode][i][refs[0]]; + mode_info[ref_mv_idx].rd = mode_info[i].rd; + mode_info[ref_mv_idx].rate_mv = this_rate_mv; + mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; + + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } } - for (i = 0; i < num_planes; i++) { - orig_dst.plane[i] = 
xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; } - const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); #if USE_DISCOUNT_NEWMV_TEST // We don't include the cost of the second reference here, because there @@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } - ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, - args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; - } else if (cpi->sf.model_based_post_interp_filter_breakout && - ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) { - restore_dst_buf(xd, orig_dst, num_planes); - if ((rd >> 4) > ref_best_rd) break; - continue; - } - + int skip_build_pred = 0; if (is_comp_pred && comp_idx) { + // Find matching interp filter or set to default interp filter + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); + } + int64_t best_rd_compound; compmode_interinter_cost = compound_type_rd( cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used, - &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats, - ref_best_rd); + &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, + rd_stats, ref_best_rd); if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, - 
bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, - plane_sse, plane_dist); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + // No need to call av1_build_inter_predictors_sby if + // COMPOUND_AVERAGE is selected because it is the first + // candidate in compound_type_rd, and the following + // compound types searching uses tmp_dst buffer + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) { + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + skip_build_pred = 1; } } + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, + args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb, + skip_build_pred, args, ref_best_rd); + if (args->modelled_rd != NULL && !is_comp_pred) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; + } + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 3) * 2 > ref_best_rd) break; + continue; + } + if (search_jnt_comp) { // if 1/2 model rd is larger than best_rd in jnt_comp mode, // use jnt_comp mode, save additional search - if ((rd >> 1) > best_rd) { + if ((rd >> 3) * 4 > best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } @@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_comp_pred) { const int mode0 = compound_ref0_mode(this_mode); const int mode1 = compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + const int64_t mrd = + 
AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - } else { - args->modelled_rd[this_mode][refs[0]] = rd; - } - } - - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; } } - rd_stats->rate += compmode_interinter_cost; if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { // TODO(chengchen): this speed feature introduces big loss. // Need better estimation of rate distortion. + int dummy_rate; + int64_t dummy_dist; + int plane_rate[MAX_MB_PLANE] = { 0 }; + int64_t plane_sse[MAX_MB_PLANE] = { 0 }; + int64_t plane_dist[MAX_MB_PLANE] = { 0 }; + + model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND]( + cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate, + &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse, + plane_dist); + rd_stats->rate += rs; rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; rd_stats_y->rate = plane_rate[0]; @@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; } else { #if CONFIG_COLLECT_INTER_MODE_RD_STATS - ret_val = - motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mi_row, mi_col, args, ref_best_rd, - refs, rate_mv, &orig_dst, best_est_rd); + ret_val = motion_mode_rd( + cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, + mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst, + tile_data, best_est_rd, do_tx_search, inter_modes_info); #else ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mi_row, mi_col, - args, ref_best_rd, refs, rate_mv, 
&orig_dst); + args, ref_best_rd, refs, &rate_mv, &orig_dst); #endif } + mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; + mode_info[ref_mv_idx].rate_mv = rate_mv; if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + mode_info[ref_mv_idx].rd = tmp_rd; if (tmp_rd < best_rd) { best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; @@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, best_disable_skip = *disable_skip; best_xskip = x->skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); } if (tmp_rd < best_rd2) { @@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } restore_dst_buf(xd, orig_dst, num_planes); } - - args->modelled_rd = NULL; } if (best_rd == INT64_MAX) return INT64_MAX; @@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, assert(IMPLIES(mbmi->comp_group_idx == 1, mbmi->interinter_comp.type != COMPOUND_AVERAGE)); memcpy(x->blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); } @@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; int cost_list[5]; int bestsme = av1_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); x->mv_limits = tmp_mv_limits; @@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, } else { super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); memset(mbmi->inter_tx_size, 
mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats.skip); } if (num_planes > 1) { super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); @@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_noskip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } if (!xd->lossless[mbmi->segment_id]) { @@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } } } @@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, *rd_cost = best_rdcost; x->skip = best_skip; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); return best_rd; } @@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, mbmi->mv[0].as_int = 0; const int64_t intra_yrd = - rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, - &y_skip, bsize, best_rd, ctx); + rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, bsize, best_rd, ctx); if (intra_yrd < best_rd) { // Only store reconstructed luma when there's chroma RDO. 
When there's no @@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) { + if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + } assert(this_mode == NEAREST_NEARESTMV); if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) { @@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); - set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h, search_state->best_mbmode.skip && is_inter_block(mbmi), xd); // Set up color-related variables for skip mode. 
@@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search( } else { super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y.skip); } if (num_planes > 1) { - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE); + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX, + FTXS_NONE); } else { av1_init_rd_stats(&rd_stats_uv); } @@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search( static void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], - uint32_t mode_skip_mask[REF_FRAMES], + uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES], unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { @@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode( x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; x->mbmi_ext->compound_mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, yv12_mb); } } - - // 
TODO(zoeliu@google.com): To further optimize the obtaining of motion vector - // references for compound prediction, as not every pair of reference frames - // woud be examined for the RD evaluation. + // ref_frame = ALTREF_FRAME for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + continue; + } + } av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, mi_col, mbmi_ext->mode_context); @@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode( } } -static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, +static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_STATS *rd_cost, + PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, + MB_MODE_INFO *const mbmi, PALETTE_MODE_INFO *const pmi, unsigned int *ref_costs_single, InterModeSearchState *search_state) { @@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; rate_overhead_palette = rd_pick_palette_intra_sby( - cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, - best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL, - NULL, NULL, NULL, ctx, best_blk_skip); + cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, &best_rd_palette, + &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip); if 
(pmi->palette_size[0] == 0) return; memcpy(x->blk_skip, best_blk_skip, @@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state, av1_zero(search_state->single_newmv); av1_zero(search_state->single_newmv_rate); av1_zero(search_state->single_newmv_valid); - for (int i = 0; i < MB_MODE_COUNT; ++i) - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) - search_state->modelled_rd[i][ref_frame] = INT64_MAX; + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < MAX_REF_MV_SERCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + } + } + } + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); } +// Case 1: return 0, means don't skip this mode +// Case 2: return 1, means skip this mode completely +// Case 3: return 2, means skip compound only, but still try single motion modes static int inter_mode_search_order_independent_skip( - const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, - int mi_row, int mi_col, uint32_t *mode_skip_mask, - uint16_t *ref_frame_skip_mask) { + const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, 
const MACROBLOCK *x, + BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col, + uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask, + InterModeSearchState *search_state) { const SPEED_FEATURES *const sf = &cpi->sf; const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; @@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip( const unsigned char segment_id = mbmi->segment_id; const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + int skip_motion_mode = 0; + if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { + const int ref_type = av1_ref_frame_type(ref_frame); + int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type); + if (ref_type <= ALTREF_FRAME && skip_ref) { + // Since the compound ref modes depends on the motion estimation result of + // two single ref modes( best mv of single ref modes as the start point ) + // If current single ref mode is marked skip, we need to check if it will + // be used in compound ref modes. + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(ctx->skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_type || rf[1] == ref_type) { + // Found a not skipped compound ref mode which contains current + // single ref. So this single ref can't be skipped completly + // Just skip it's motion mode search, still try it's simple + // transition mode. 
+ skip_motion_mode = 1; + skip_ref = 0; + break; + } + } + } + } + if (skip_ref) return 1; + } if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && !x->cb_partition_scan) { @@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip( return 1; } - if (skip_repeated_mv(cm, x, this_mode, ref_frame)) { + if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) { return 1; } + if (skip_motion_mode) { + return 2; + } return 0; } @@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index, set_default_interp_filters(mbmi, cm->interp_filter); } -static int handle_intra_mode(InterModeSearchState *search_state, - const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int ref_frame_cost, - const PICK_MODE_CONTEXT *ctx, int disable_skip, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, - RD_STATS *rd_stats_uv) { +static int64_t handle_intra_mode(InterModeSearchState *search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state, const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int num_planes = av1_num_planes(cm); - av1_init_rd_stats(rd_stats); - av1_init_rd_stats(rd_stats_y); - av1_init_rd_stats(rd_stats_uv); + const int skip_ctx = av1_get_skip_context(xd); + + int known_rate = intra_mode_cost[mbmi->mode]; + known_rate += ref_frame_cost; + if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED) + known_rate += intra_cost_penalty; + known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > search_state->best_rd) 
{ + search_state->skip_intra_modes = 1; + return INT64_MAX; + } + TX_SIZE uv_tx; int is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && av1_use_angle_delta(bsize)) { @@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->directional_mode_skip_mask); search_state->angle_stats_ready = 1; } - if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0; + if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX; + av1_init_rd_stats(rd_stats_y); rd_stats_y->rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, - intra_mode_cost[mbmi->mode], search_state->best_rd, - &model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y, + bsize, intra_mode_cost[mbmi->mode], + search_state->best_rd, &model_rd); } else { + av1_init_rd_stats(rd_stats_y); mbmi->angle_delta[PLANE_TYPE_Y] = 0; super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd); } uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); - + int try_filter_intra = 0; + int64_t best_rd_tmp = INT64_MAX; if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + if (rd_stats_y->rate != INT_MAX) { + const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] + + intra_mode_cost[mbmi->mode]; + best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd); + } else { + try_filter_intra = !(search_state->best_mbmode.skip); + } + } + if (try_filter_intra) { RD_STATS rd_stats_y_fi; int filter_intra_selected_flag = 0; TX_SIZE best_tx_size = mbmi->tx_size; @@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state, memcpy(best_txk_type, mbmi->txk_type, sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; - int64_t 
best_rd_tmp = INT64_MAX; - if (rd_stats_y->rate != INT_MAX) { - best_rd_tmp = RDCOST(x->rdmult, - rd_stats_y->rate + x->filter_intra_cost[bsize][0] + - intra_mode_cost[mbmi->mode], - rd_stats_y->dist); - } mbmi->filter_intra_mode_info.use_filter_intra = 1; for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; ++fi_mode) { int64_t this_rd_tmp; mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; - super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd); if (rd_stats_y_fi.rate == INT_MAX) { continue; @@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state, intra_mode_cost[mbmi->mode]); this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) { + break; + } if (this_rd_tmp < best_rd_tmp) { best_tx_size = mbmi->tx_size; memcpy(best_txk_type, mbmi->txk_type, @@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state, mbmi->filter_intra_mode_info.use_filter_intra = 0; } } - - if (rd_stats_y->rate == INT_MAX) return 0; - + if (rd_stats_y->rate == INT_MAX) return INT64_MAX; + const int mode_cost_y = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_uv); if (num_planes > 1) { uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + int rate_y = + rd_stats_y->skip ? 
x->skip_cost[skip_ctx][1] : rd_stats_y->rate; + const int64_t rdy = + RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist); + if (search_state->best_rd < (INT64_MAX / 2) && + rdy > (search_state->best_rd + (search_state->best_rd >> 2))) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } choose_intra_uv_mode( cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], &search_state->rate_uv_tokenonly[uv_tx], @@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state, &search_state->mode_uv[uv_tx]); if (try_palette) search_state->pmi_uv[uv_tx] = *pmi; search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = search_state->rate_uv_tokenonly[uv_tx]; + const int64_t uv_dist = search_state->dist_uvs[uv_tx]; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > search_state->best_rd) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } } rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx]; @@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state, } mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; } - - rd_stats->rate = - rd_stats_y->rate + - intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + rd_stats->rate = rd_stats_y->rate + mode_cost_y; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded @@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state, rd_stats_y->rate = 0; rd_stats_uv->rate = 0; // Cost the skip mb case - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; + rd_stats->rate += x->skip_cost[skip_ctx][1]; } else { // Add in the cost of the no skip flag. 
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0]; + rd_stats->rate += x->skip_cost[skip_ctx][0]; } // Calculate the final RD estimate for this mode. - int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - + const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); // Keep record of best intra rd if (this_rd < search_state->best_intra_rd) { search_state->best_intra_rd = this_rd; @@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->best_pred_rd[i] = AOMMIN(search_state->best_pred_rd[i], this_rd); } - return 1; + return this_rd; +} + +static void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < 
modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static void analyze_single_states(const AV1_COMP *cpi, + InterModeSearchState *search_state) { + int i, j, dir, mode; + if (cpi->sf.prune_comp_search_by_single_result >= 1) { + for (dir = 0; dir < 2; ++dir) { + int64_t best_rd; + SingleInterModeState(*state)[FWD_REFS]; + + // Use the best rd of GLOBALMV or NEWMV to prune the unlikely + // reference frames for all the modes (NEARESTMV and NEARMV may not + // have same motion vectors). Always keep the best of each mode + // because it might form the best possible combination with other mode. 
+ state = search_state->single_state[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + + state = search_state->single_state_modelled[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; + ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + } + } + + // Ordering by simple rd first, then by modelled rd + for (dir = 0; dir < 2; ++dir) { + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + const int state_cnt_s = search_state->single_state_cnt[dir][mode]; + const int state_cnt_m = + search_state->single_state_modelled_cnt[dir][mode]; + SingleInterModeState *state_s = search_state->single_state[dir][mode]; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode]; + int count = 0; + const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); + for (i = 0; i < state_cnt_s; ++i) { + if (state_s[i].rd == INT64_MAX) break; + if (state_s[i].valid) + search_state->single_rd_order[dir][mode][count++] = + state_s[i].ref_frame; + } + if (count < max_candidates) { + for (i = 0; i < state_cnt_m; ++i) { + if (state_m[i].rd == INT64_MAX) break; + if (state_m[i].valid) { + int ref_frame = state_m[i].ref_frame; + int match = 0; + // Check if existing already + for (j = 0; j < count; ++j) { + if (search_state->single_rd_order[dir][mode][j] == ref_frame) { + match = 1; + break; + } + } + if (!match) { + // Check if this ref_frame is removed in simple rd + int valid = 1; + for (j = 0; j < state_cnt_s; j++) { + if 
(ref_frame == state_s[j].ref_frame && !state_s[j].valid) { + valid = 0; + break; + } + } + if (valid) + search_state->single_rd_order[dir][mode][count++] = ref_frame; + } + if (count >= max_candidates) break; + } + } + } + } + } +} + +static int compound_skip_get_candidates( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const int dir, const PREDICTION_MODE mode) { + const int mode_offset = INTER_OFFSET(mode); + const SingleInterModeState *state = + search_state->single_state[dir][mode_offset]; + const SingleInterModeState *state_modelled = + search_state->single_state_modelled[dir][mode_offset]; + int max_candidates = 0; + int candidates; + + for (int i = 0; i < FWD_REFS; ++i) { + if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; + max_candidates++; + } + + candidates = max_candidates; + if (cpi->sf.prune_comp_search_by_single_result >= 2) { + candidates = AOMMIN(2, max_candidates); + } + if (cpi->sf.prune_comp_search_by_single_result >= 3) { + if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && + state[0].ref_frame == state_modelled[0].ref_frame) + candidates = 1; + if (mode == NEARMV || mode == GLOBALMV) candidates = 1; + } + return candidates; +} + +static int compound_skip_by_single_states( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, + const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) { + const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame }; + const int mode[2] = { compound_ref0_mode(this_mode), + compound_ref1_mode(this_mode) }; + const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) }; + const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1, + refs[1] <= GOLDEN_FRAME ? 
0 : 1 }; + int ref_searched[2] = { 0, 0 }; + int ref_mv_match[2] = { 1, 1 }; + int i, j; + + for (i = 0; i < 2; ++i) { + const SingleInterModeState *state = + search_state->single_state[mode_dir[i]][mode_offset[i]]; + const int state_cnt = + search_state->single_state_cnt[mode_dir[i]][mode_offset[i]]; + for (j = 0; j < state_cnt; ++j) { + if (state[j].ref_frame == refs[i]) { + ref_searched[i] = 1; + break; + } + } + } + + const int ref_set = get_drl_refmv_count(x, refs, this_mode); + for (i = 0; i < 2; ++i) { + if (mode[i] == NEARESTMV || mode[i] == NEARMV) { + const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME }; + int idential = 1; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) { + int_mv single_mv; + int_mv comp_mv; + get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs, + x->mbmi_ext); + get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext); + + idential &= (single_mv.as_int == comp_mv.as_int); + if (!idential) { + ref_mv_match[i] = 0; + break; + } + } + } + } + + for (i = 0; i < 2; ++i) { + if (ref_searched[i] && ref_mv_match[i]) { + const int candidates = + compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]); + const MV_REFERENCE_FRAME *ref_order = + search_state->single_rd_order[mode_dir[i]][mode_offset[i]]; + int match = 0; + for (j = 0; j < candidates; ++j) { + if (refs[i] == ref_order[j]) { + match = 1; + break; + } + } + if (!match) return 1; + } + } + + return 0; +} + +static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode, + InterModeSearchState *search_state) { + const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1]; + if (search_state->num_available_refs > 2) { + if ((ref_frame == search_state->dist_order_refs[0] && + second_ref_frame == search_state->dist_order_refs[1]) || + (ref_frame == search_state->dist_order_refs[1] && + second_ref_frame == search_state->dist_order_refs[0])) + return 1; // drop this 
pair of refs + } + return 0; +} + +static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state, + const MODE_DEFINITION *mode, + int64_t distortion2) { + const PREDICTION_MODE this_mode = mode->mode; + MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const int idx = ref_frame - LAST_FRAME; + if (idx && distortion2 > search_state->dist_refs[idx]) { + search_state->dist_refs[idx] = distortion2; + search_state->dist_order_refs[idx] = ref_frame; + } + + // Reach the last single ref prediction mode + if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { + // bubble sort dist_refs and the order index + for (int i = 0; i < REF_FRAMES; ++i) { + for (int k = i + 1; k < REF_FRAMES; ++k) { + if (search_state->dist_refs[i] < search_state->dist_refs[k]) { + int64_t tmp_dist = search_state->dist_refs[i]; + search_state->dist_refs[i] = search_state->dist_refs[k]; + search_state->dist_refs[k] = tmp_dist; + + int tmp_idx = search_state->dist_order_refs[i]; + search_state->dist_order_refs[i] = search_state->dist_order_refs[k]; + search_state->dist_order_refs[k] = tmp_idx; + } + } + } + for (int i = 0; i < REF_FRAMES; ++i) { + if (search_state->dist_refs[i] == -1) break; + search_state->num_available_refs = i; + } + search_state->num_available_refs++; + } +} + +static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm, + CompoundTypeRdBuffers *const bufs) { + CHECK_MEM_ERROR( + cm, bufs->pred0, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); + CHECK_MEM_ERROR( + cm, bufs->pred1, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); + CHECK_MEM_ERROR( + cm, bufs->residual1, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); + CHECK_MEM_ERROR( + cm, bufs->diff10, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); + CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf, + (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * + sizeof(*bufs->tmp_best_mask_buf))); } -void 
av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, +static void release_compound_type_rd_buffers( + CompoundTypeRdBuffers *const bufs) { + aom_free(bufs->pred0); + aom_free(bufs->pred1); + aom_free(bufs->residual1); + aom_free(bufs->diff10); + aom_free(bufs->tmp_best_mask_buf); + av1_zero(*bufs); // Set all pointers to NULL for safety. +} + +void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - const AV1_COMMON *const cm = &cpi->common; + AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const struct segmentation *const seg = &cm->seg; PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; - int i, k; + int i; struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; @@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, InterModeSearchState search_state; init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, best_rd_so_far); - + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; HandleInterModeArgs args = { { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, NULL, NULL, - NULL, NULL, + NULL, search_state.modelled_rd, { { 0 } }, INT_MAX, - INT_MAX + INT_MAX, search_state.simple_rd, + 0, interintra_modes }; for (i = 0; 
i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; av1_invalid_rd_stats(rd_cost); // init params, set frame modes, speed features - set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, - ref_frame_skip_mask, mode_skip_mask, - ref_costs_single, ref_costs_comp, yv12_mb); + set_params_rd_pick_inter_mode( + cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask, + ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t best_est_rd = INT64_MAX; + // TODO(angiebird): Turn this on when this speed feature is well tested +#if 1 + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + const int do_tx_search = !md->ready; +#else + const int do_tx_search = 1; +#endif + InterModesInfo *inter_modes_info = &tile_data->inter_modes_info; + inter_modes_info->num = 0; #endif + int intra_mode_num = 0; + int intra_mode_idx_ls[MAX_MODES]; + int reach_first_comp_mode = 0; + + // Temporary buffers used by handle_inter_mode(). + // We allocate them once and reuse them in every call to that function. + // Note: Must be allocated on the heap due to large size of the arrays.
+ uint8_t *tmp_buf_orig; + CHECK_MEM_ERROR( + cm, tmp_buf_orig, + (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE)); + uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig); + + CompoundTypeRdBuffers rd_buffers; + alloc_compound_type_rd_buffers(cm, &rd_buffers); + for (int midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int64_t this_rd = INT64_MAX; @@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - - this_mode = av1_mode_order[mode_index].mode; - ref_frame = av1_mode_order[mode_index].ref_frame[0]; - second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; + const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index]; + const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + this_mode = mode_order->mode; init_mbmi(mbmi, mode_index, cm); x->skip = 0; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index, - mi_row, mi_col, mode_skip_mask, - ref_frame_skip_mask)) - continue; - - if (ref_frame == INTRA_FRAME) { - if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) - continue; + // Reach the first compound prediction mode + if (sf->prune_comp_search_by_single_result > 0 && comp_pred && + reach_first_comp_mode == 0) { + analyze_single_states(cpi, &search_state); + reach_first_comp_mode = 1; } + const int ret = inter_mode_search_order_independent_skip( + cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask, + ref_frame_skip_mask, &search_state); + if (ret == 1) continue; + args.skip_motion_mode = (ret == 2); - if (sf->drop_ref) { - if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) { - if (search_state.num_available_refs > 2) { - if ((ref_frame == 
search_state.dist_order_refs[0] && - second_ref_frame == search_state.dist_order_refs[1]) || - (ref_frame == search_state.dist_order_refs[1] && - second_ref_frame == search_state.dist_order_refs[0])) - continue; - } + if (sf->drop_ref && comp_pred) { + if (sf_check_is_drop_ref(mode_order, &search_state)) { + continue; } } if (search_state.best_rd < search_state.mode_threshold[mode_index]) continue; - const int comp_pred = second_ref_frame > INTRA_FRAME; + if (sf->prune_comp_search_by_single_result > 0 && comp_pred) { + if (compound_skip_by_single_states(cpi, &search_state, this_mode, + ref_frame, second_ref_frame, x)) + continue; + } + const int ref_frame_cost = comp_pred ? ref_costs_comp[ref_frame][second_ref_frame] : ref_costs_single[ref_frame]; @@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } if (ref_frame == INTRA_FRAME) { - RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; - const int ret = handle_intra_mode( - &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip, - &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); - if (!ret) { - continue; - } - rate2 = intra_rd_stats.rate; - distortion2 = intra_rd_stats.dist; - this_rd = RDCOST(x->rdmult, rate2, distortion2); - skippable = intra_rd_stats.skip; - rate_y = intra_rd_stats_y.rate; + intra_mode_idx_ls[intra_mode_num++] = mode_index; + continue; } else { mbmi->angle_delta[PLANE_TYPE_Y] = 0; mbmi->angle_delta[PLANE_TYPE_UV] = 0; @@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv = search_state.single_newmv; args.single_newmv_rate = search_state.single_newmv_rate; args.single_newmv_valid = search_state.single_newmv_valid; - args.modelled_rd = search_state.modelled_rd; args.single_comp_cost = real_compmode_cost; args.ref_frame_cost = ref_frame_cost; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, - 
&rd_stats_uv, &disable_skip, mi_row, mi_col, - &args, ref_best_rd, &best_est_rd); + this_rd = handle_inter_mode( + cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, + mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data, + &best_est_rd, do_tx_search, inter_modes_info); #else this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, mi_row, mi_col, - &args, ref_best_rd); + &args, ref_best_rd, tmp_buf, &rd_buffers); #endif rate2 = rd_stats.rate; skippable = rd_stats.skip; @@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate_uv = rd_stats_uv.rate; } + if (sf->prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode)) { + collect_single_states(x, &search_state, mbmi); + } + if (this_rd == INT64_MAX) continue; this_skip2 = mbmi->skip; @@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, search_state.best_mbmode = *mbmi; search_state.best_skip2 = this_skip2; search_state.best_mode_skippable = skippable; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (do_tx_search) { + // When do_tx_search == 0, handle_inter_mode won't provide correct + // rate_y and rate_uv because txfm_search process is replaced by + // rd estimation. + // Therefore, we should avoid updating best_rate_y and best_rate_uv + // here.
These two values will be updated when txfm_search is called + search_state.best_rate_y = + rate_y + + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; + search_state.best_rate_uv = rate_uv; + } +#else // CONFIG_COLLECT_INTER_MODE_RD_STATS search_state.best_rate_y = rate_y + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; search_state.best_rate_uv = rate_uv; +#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } @@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT]) search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } + if (sf->drop_ref && second_ref_frame == NONE_FRAME) { + // Collect data from single ref mode, and analyze data. + sf_drop_ref_analyze(&search_state, mode_order, distortion2); + } - if (sf->drop_ref) { - if (second_ref_frame == NONE_FRAME) { - const int idx = ref_frame - LAST_FRAME; - if (idx && distortion2 > search_state.dist_refs[idx]) { - search_state.dist_refs[idx] = distortion2; - search_state.dist_order_refs[idx] = ref_frame; - } + if (x->skip && !comp_pred) break; + } - // Reach the last single ref prediction mode - if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { - // bubble sort dist_refs and the order index - for (i = 0; i < REF_FRAMES; ++i) { - for (k = i + 1; k < REF_FRAMES; ++k) { - if (search_state.dist_refs[i] < search_state.dist_refs[k]) { - int64_t tmp_dist = search_state.dist_refs[i]; - search_state.dist_refs[i] = search_state.dist_refs[k]; - search_state.dist_refs[k] = tmp_dist; - - int tmp_idx = search_state.dist_order_refs[i]; - search_state.dist_order_refs[i] = - search_state.dist_order_refs[k]; - search_state.dist_order_refs[k] = tmp_idx; - } - } - } + aom_free(tmp_buf_orig); + tmp_buf_orig = NULL; + release_compound_type_rd_buffers(&rd_buffers); - for (i = 0; i < REF_FRAMES; ++i) { - if 
(search_state.dist_refs[i] == -1) break; - search_state.num_available_refs = i; - } - search_state.num_available_refs++; - } +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (!do_tx_search) { + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state.best_rd = INT64_MAX; + + int64_t top_est_rd = + inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]; + for (int j = 0; j < inter_modes_info->num; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.9 > top_est_rd) { + continue; + } + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + + x->skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_rate, search_state.best_rd)) { + continue; + } else { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + + if (rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = rd_stats.rdcost; + // Note index of best mode so far + const int mode_index = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], 
mbmi->ref_frame[1]); + search_state.best_mode_index = mode_index; + *rd_cost = rd_stats; + search_state.best_rd = rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = mbmi->skip; + search_state.best_mode_skippable = rd_stats.skip; + search_state.best_rate_y = + rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip]; + search_state.best_rate_uv = rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } } + } +#endif - if (x->skip && !comp_pred) break; + for (int j = 0; j < intra_mode_num; ++j) { + const int mode_index = intra_mode_idx_ls[j]; + const MV_REFERENCE_FRAME ref_frame = + av1_mode_order[mode_index].ref_frame[0]; + assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME); + assert(ref_frame == INTRA_FRAME); + if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break; + init_mbmi(mbmi, mode_index, cm); + x->skip = 0; + set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME); + + // Select prediction reference frames. 
+ for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + } + + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + + const int ref_frame_cost = ref_costs_single[ref_frame]; + intra_rd_stats.rdcost = handle_intra_mode( + &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0, + &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); + if (intra_rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = intra_rd_stats.rdcost; + // Note index of best mode so far + search_state.best_mode_index = mode_index; + *rd_cost = intra_rd_stats; + search_state.best_rd = intra_rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = intra_rd_stats.skip; + search_state.best_rate_y = + intra_rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip]; + search_state.best_rate_uv = intra_rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } } // In effect only when speed >= 2. @@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Only try palette mode when the best mode so far is an intra mode. 
if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) { - search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi, + search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single, &search_state); } @@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; - mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); // Select the samples according to motion vector difference - if (mbmi->num_proj_ref[0] > 1) - mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, - mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); } set_default_interp_filters(mbmi, cm->interp_filter); @@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above( struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; - const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); @@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left( struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; - const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); @@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int above_stride, const uint8_t *left, int left_stride) { 
const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - const int bw = xd->n8_w << MI_SIZE_LOG2; - const int bh = xd->n8_h << MI_SIZE_LOG2; + const int bw = xd->n4_w << MI_SIZE_LOG2; + const int bh = xd->n4_h << MI_SIZE_LOG2; int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index 12df472c1..4c11f90b8 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RDOPT_H_ -#define AV1_ENCODER_RDOPT_H_ +#ifndef AOM_AV1_ENCODER_RDOPT_H_ +#define AOM_AV1_ENCODER_RDOPT_H_ #include "av1/common/blockd.h" #include "av1/common/txb_common.h" @@ -25,6 +25,10 @@ extern "C" { #endif #define MAX_REF_MV_SERCH 3 +#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1 +#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2 +#define DEFAULT_INTERP_SKIP_FLAG \ + (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG) struct TileInfo; struct macroblock; @@ -111,7 +115,7 @@ unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); -void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi, +void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, @@ -123,14 +127,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip( BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); #if CONFIG_COLLECT_INTER_MODE_RD_STATS -#define INTER_MODE_RD_TEST 0 -void av1_inter_mode_data_init(); -void av1_inter_mode_data_fit(int rdmult); -void av1_inter_mode_data_show(const AV1_COMMON *cm); +void av1_inter_mode_data_init(struct TileDataEnc *tile_data); +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_ENCODER_RDOPT_H_ +#endif // 
AOM_AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c new file mode 100644 index 000000000..23d920fc3 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" + +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/obmc.h" +#include "av1/encoder/reconinter_enc.h" + +static INLINE void calc_subpel_params( + MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, + int plane, const int pre_x, const int pre_y, int x, int y, + struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params, + int bw, int bh) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = pd->subsampling_x; + int ssy = pd->subsampling_y; + int orig_pos_y = (pre_y + y) << SUBPEL_BITS; + orig_pos_y += mv.row * (1 << (1 - ssy)); + int orig_pos_x = (pre_x + x) << SUBPEL_BITS; + orig_pos_x += mv.col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += 
SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + } else { + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + + (x + (mv_q4.col >> SUBPEL_BITS)); + } +} + +static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + int is_compound = has_second_ref(mi); + int ref; + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + int is_global[2] = { 0, 0 }; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || + (block_size_high[bsize] < 8 && ss_y); + + if (is_intrabc) sub8x8_inter = 0; + + // For 
sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + sub8x8_inter = sub8x8_inter && !build_for_obmc; + if (sub8x8_inter) { + for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; + if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; + } + } + } + + if (sub8x8_inter) { + // block size + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize] >> ss_x; + const int b8_h = block_size_high[plane_bsize] >> ss_y; + assert(!is_compound); + + const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + is_compound = has_second_ref(this_mbmi); + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + ConvolveParams conv_params = get_conv_params_no_round( + 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + conv_params.use_jnt_comp_avg = 0; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + + ref = 0; + const RefBuffer *ref_buf = + &cm->frame_refs[this_mbmi->ref_frame[ref] - 
LAST_FRAME]; + + pd->pre[ref].buf0 = + (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; + pd->pre[ref].buf = + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf->uv_stride, + &ref_buf->sf); + pd->pre[ref].width = ref_buf->buf->uv_crop_width; + pd->pre[ref].height = ref_buf->buf->uv_crop_height; + pd->pre[ref].stride = ref_buf->buf->uv_stride; + + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &ref_buf->sf; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + + const MV mv = this_mbmi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; + + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); + conv_params.do_average = ref; + if (is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + } + + av1_make_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, + b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, + (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, + plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); + + ++col; + } + ++row; + } + + for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; + return; + } + + { + ConvolveParams conv_params = get_conv_params_no_round( + 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, + &conv_params.bck_offset, + &conv_params.use_jnt_comp_avg, is_compound); + + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? 
&cm->sf_identity : &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre, + &subpel_params, bw, bh); + + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global[ref]; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + if (ref && is_masked_compound_type(mi->interinter_comp.type)) { + // masked compound type has its own average mechanism + conv_params.do_average = 0; + av1_make_masked_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, plane, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd, + cm->allow_warped_motion); + } else { + conv_params.do_average = ref; + av1_make_inter_predictor( + pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, + bh, &conv_params, mi->interp_filters, &warp_types, + mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref, + mi, build_for_obmc, xd, cm->allow_warped_motion); + } + } + } +} + +static void build_inter_predictors_for_planes(const AV1_COMMON *cm, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col, + int plane_from, int plane_to) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = pd->width; + const int bh = pd->height; + + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + + build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); + } +} + +void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + av1_build_inter_predictors_sbp(cm, xd, 
mi_row, mi_col, ctx, bsize, 0); +} + +void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) { + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, + plane_idx); + } +} + +void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize, int plane_idx) { + build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx, + plane_idx); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } }; + if (!ctx) { + default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf; + default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride; + ctx = &default_ctx; + } + av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf, + xd->plane[plane_idx].dst.stride, ctx, + plane_idx, bsize); + } +} + +void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); +} + +// TODO(sarahparker): +// av1_build_inter_predictor should be combined with +// av1_make_inter_predictor +void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous) { + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? 
src_mv->col : src_mv->col * 2 }; + MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); + mv.col += SCALE_EXTRA_OFF; + mv.row += SCALE_EXTRA_OFF; + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + mv.col & SCALE_SUBPEL_MASK, + mv.row & SCALE_SUBPEL_MASK }; + src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + + (mv.col >> SCALE_SUBPEL_BITS); + + av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, + w, h, conv_params, interp_filters, warp_types, p_col, + p_row, plane, ref, xd->mi[0], 0, xd, + can_use_previous); +} + +static INLINE void build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = ctxt->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; + + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, + above_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = ctxt->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y); + } + *above_mbmi = backup_mbmi; +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // prediction block. 
This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + int this_height = xd->n4_h * MI_SIZE; + int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += (this_height - pred_height) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, mi_col, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; +} + +static INLINE void build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = ctxt->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height, + left_mbmi, ctxt, num_planes); + mi_x = ctxt->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y); + } + *left_mbmi = backup_mbmi; +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int 
tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. + int this_width = xd->n4_w * MI_SIZE; + int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += (this_width - pred_width) * 8; + + struct build_prediction_ctxt ctxt = { cm, mi_row, + mi_col, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, mi_row, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_right_edge -= (this_width - pred_width) * 8; + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + 
CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } + av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, + dst_width1, dst_height1, dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, + dst_width2, dst_height2, dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, + dst_buf2, dst_stride2); +} + +// Builds the inter-predictor for the single ref case +// for use in the encoder to search the wedges efficiently. 
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, + int bw, int bh, int x, int y, + int w, int h, int mi_x, int mi_y, + int ref, uint8_t *const ext_dst, + int ext_dst_stride, + int can_use_previous) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *mi = xd->mi[0]; + + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; + const MV mv = mi->mv[ref].as_mv; + + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + const int pre_x = (mi_x) >> pd->subsampling_x; + const int pre_y = (mi_y) >> pd->subsampling_y; + uint8_t *pre; + SubpelParams subpel_params; + calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, + &subpel_params, bw, bh); + + av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, + &subpel_params, sf, w, h, &conv_params, + mi->interp_filters, &warp_types, pre_x + x, + pre_y + y, plane, ref, mi, 0, xd, can_use_previous); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, 
mi_x, + mi_y, ref, ext_dst[plane], + ext_dst_stride[plane], can_use_previous); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. 
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + else + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, + mbmi->sb_type, h, w, xd->bd); + else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); + } else { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + 
aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, + xd->bd); + else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, + 0, NULL, 0, w, h); + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]) { + int plane; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h new file mode 100644 index 000000000..10d5e8c28 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/filter.h" +#include "av1/common/blockd.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/convolve.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize, int plane_idx); + +void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BUFFER_SET *ctx, + BLOCK_SIZE bsize); + +void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, const MV *src_mv, + const struct scale_factors *sf, int w, int h, + ConvolveParams *conv_params, + InterpFilters interp_filters, + const WarpTypesAllowed *warp_types, int p_col, + int p_row, int plane, int ref, + enum mv_precision precision, int x, int y, + const MACROBLOCKD *xd, int can_use_previous); + +// Detect if the block have sub-pixel level motion vectors +// per component. 
+#define CHECK_SUBPEL 0 +static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi, + const MACROBLOCKD *const xd, + int dir) { +#if CHECK_SUBPEL + const BLOCK_SIZE bsize = mbmi->sb_type; + int plane; + int ref = (dir >> 1); + + if (dir & 0x01) { + if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; + } else { + if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; + } + + return 0; +#else + (void)mbmi; + (void)xd; + (void)dir; + return 1; +#endif +} + +static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { + MB_MODE_INFO *const mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); + int ref; + for (ref = 0; ref < 1 + is_compound; ++ref) { + int row_col; + for (row_col = 0; row_col < 2; ++row_col) { + const int dir = (ref << 1) + row_col; + if (has_subpel_mv_component(mi, xd, dir)) { + return 1; + } + } + } + return 0; +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col); + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, + int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], + int can_use_previous); + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // 
AOM_AV1_ENCODER_RECONINTER_ENC_H_ diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h index a207b0f26..1ad13d66a 100644 --- a/third_party/aom/av1/encoder/segmentation.h +++ b/third_party/aom/av1/encoder/segmentation.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_SEGMENTATION_H_ -#define AV1_ENCODER_SEGMENTATION_H_ +#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ +#define AOM_AV1_ENCODER_SEGMENTATION_H_ #include "av1/common/blockd.h" #include "av1/encoder/encoder.h" @@ -35,4 +35,4 @@ void av1_reset_segment_features(AV1_COMMON *cm); } // extern "C" #endif -#endif // AV1_ENCODER_SEGMENTATION_H_ +#endif // AOM_AV1_ENCODER_SEGMENTATION_H_ diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index d4b4b19c4..4c35baae0 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -98,6 +98,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, sf->use_square_partition_only_threshold = BLOCK_64X64; } + // TODO(huisu@google.com): train models for 720P and above. 
+ if (!is_720p_or_larger) { + sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + if (speed >= 1) { if (is_720p_or_larger) { sf->use_square_partition_only_threshold = BLOCK_128X128; @@ -106,6 +115,14 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } else { sf->use_square_partition_only_threshold = BLOCK_32X32; } + + if (!is_720p_or_larger) { + sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } } if (speed >= 2) { @@ -126,13 +143,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, if (speed >= 3) { if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; - sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; sf->partition_search_breakout_dist_thr = (1 << 25); sf->partition_search_breakout_rate_thr = 200; } else { sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; - sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; sf->partition_search_breakout_dist_thr = (1 << 23); sf->partition_search_breakout_rate_thr = 120; } @@ -166,6 +181,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, // Speed 0 for all speed features that give neutral coding performance change. 
sf->reduce_inter_modes = 1; sf->prune_ext_partition_types_search_level = 1; + sf->ml_prune_rect_partition = 1; sf->ml_prune_ab_partition = 1; sf->ml_prune_4_partition = 1; sf->adaptive_txb_search_level = 1; @@ -173,6 +189,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->model_based_prune_tx_search_level = 1; sf->model_based_post_interp_filter_breakout = 1; sf->inter_mode_rd_model_estimation = 1; + sf->prune_ref_frame_for_rect_partitions = + !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame); + sf->less_rectangular_check_level = 1; + sf->gm_search_type = GM_REDUCED_REF_SEARCH; + sf->gm_disable_recode = 1; if (speed >= 1) { sf->gm_erroradv_type = GM_ERRORADV_TR_1; @@ -182,8 +203,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->intra_tx_size_search_init_depth_rect = 1; sf->intra_tx_size_search_init_depth_sqr = 1; sf->tx_size_search_lgr_block = 1; - sf->two_pass_partition_search = 1; - sf->mode_pruning_based_on_two_pass_partition_search = 1; + if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) { + sf->two_pass_partition_search = 1; + sf->mode_pruning_based_on_two_pass_partition_search = 1; + } sf->prune_ext_partition_types_search_level = 2; sf->use_fast_interpolation_filter_search = 1; sf->skip_repeat_interpolation_filter_search = 1; @@ -198,6 +221,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->use_intra_txb_hash = 1; sf->optimize_b_precheck = 1; sf->dual_sgr_penalty_level = 1; + sf->use_accurate_subpel_search = 1; + sf->reuse_inter_intra_mode = 1; + sf->prune_comp_search_by_single_result = 1; + sf->skip_repeated_newmv = 1; + sf->obmc_full_pixel_search_level = 1; } if (speed >= 2) { @@ -206,7 +234,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->selective_ref_frame = 2; sf->fast_cdef_search = 1; - sf->use_rd_breakout = 1; sf->adaptive_rd_thresh = 1; sf->mv.auto_mv_step_size = 1; sf->mv.subpel_iters_per_step = 1; @@ 
-224,8 +251,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, if (speed >= 3) { sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL; - sf->less_rectangular_check = 1; - sf->mode_skip_start = 10; + sf->less_rectangular_check_level = 2; sf->adaptive_pred_interp_filter = 1; // adaptive_motion_search breaks encoder multi-thread tests. // The values in x->pred_mv[] differ for single and multi-thread cases. @@ -237,6 +263,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->adaptive_rd_thresh = 2; sf->tx_type_search.prune_mode = PRUNE_2D_FAST; sf->gm_search_type = GM_DISABLE_SEARCH; + sf->prune_comp_search_by_single_result = 2; } if (speed >= 4) { @@ -250,10 +277,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; sf->alt_ref_search_fp = 1; - sf->mode_skip_start = 6; - sf->adaptive_interp_filter_search = 1; } if (speed >= 5) { @@ -276,7 +300,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->disable_filter_search_var_thresh = 200; - sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->partition_search_breakout_rate_thr = 300; sf->use_transform_domain_distortion = 2; @@ -296,33 +319,17 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->simple_model_rd_from_var = 1; } if (speed >= 7) { - const int is_keyframe = cm->frame_type == KEY_FRAME; - const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; sf->default_max_partition_size = BLOCK_32X32; sf->default_min_partition_size = BLOCK_8X8; sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->mv.search_method = FAST_HEX; - sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; - sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; sf->partition_search_type = REFERENCE_PARTITION; - sf->reuse_inter_pred_sby = 1; - sf->force_frame_boost = - is_keyframe || - (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); - sf->max_delta_qindex = is_keyframe ? 20 : 15; - sf->coeff_prob_appx_step = 4; sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; } if (speed >= 8) { sf->mv.search_method = FAST_DIAMOND; - sf->mv.fullpel_search_step_param = 10; sf->mv.subpel_force_stop = 2; sf->lpf_pick = LPF_PICK_MINIMAL_LPF; } @@ -356,54 +363,6 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } -static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { - AV1_COMMON *const cm = &cpi->common; - - if (speed & TXFM_CODING_SF) { - sf->inter_tx_size_search_init_depth_rect = 1; - sf->inter_tx_size_search_init_depth_sqr = 1; - sf->intra_tx_size_search_init_depth_rect = 1; - sf->intra_tx_size_search_init_depth_sqr = 1; - sf->tx_size_search_method = USE_FAST_RD; - sf->tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_type_search.fast_inter_tx_type_search = 1; - } - - if (speed & INTER_PRED_SF) { - sf->selective_ref_frame = 2; - // sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; - sf->adaptive_rd_thresh = 1; - sf->mv.subpel_iters_per_step = 1; - 
sf->adaptive_pred_interp_filter = 1; - } - - if (speed & INTRA_PRED_SF) { - sf->max_intra_bsize = BLOCK_32X32; - } - - if (speed & PARTITION_SF) { - if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - has_internal_image_edge(cpi)) { - sf->use_square_partition_only_threshold = - frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4; - } else { - sf->use_square_partition_only_threshold = - frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4; - } - sf->less_rectangular_check = 1; - sf->prune_ext_partition_types_search_level = 2; - } - - if (speed & LOOP_FILTER_SF) { - sf->fast_cdef_search = 1; - } - - if (speed & RD_SKIP_SF) { - sf->use_rd_breakout = 1; - } -} - void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; @@ -432,9 +391,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { #endif // DISABLE_TRELLISQ_SEARCH sf->gm_erroradv_type = GM_ERRORADV_TR_0; sf->mv.reduce_first_step_size = 0; - sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; - sf->mv.fullpel_search_step_param = 6; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->tx_size_search_method = USE_FULL_RD; @@ -450,7 +407,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; - sf->cb_pred_filter_search = 0; sf->cb_partition_search = 0; sf->alt_ref_search_fp = 0; sf->partition_search_type = SEARCH_PARTITION; @@ -461,22 +417,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->tx_type_search.fast_inter_tx_type_search = 0; sf->tx_type_search.skip_tx_search = 0; sf->selective_ref_frame = 0; - sf->less_rectangular_check = 0; + sf->less_rectangular_check_level = 0; sf->use_square_partition_only_threshold = BLOCK_128X128; + sf->prune_ref_frame_for_rect_partitions = 0; sf->auto_min_max_partition_size = NOT_IN_USE; 
sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_LARGEST; sf->default_min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; - sf->last_partitioning_redo_frequency = 4; sf->disable_split_mask = 0; sf->mode_search_skip_flags = 0; - sf->force_frame_boost = 0; - sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; - sf->adaptive_interp_filter_search = 0; sf->allow_partition_search_skip = 0; - sf->use_accurate_subpel_search = 1; + sf->use_accurate_subpel_search = 2; sf->disable_wedge_search_var_thresh = 0; sf->fast_wedge_sign_estimate = 0; sf->drop_ref = 0; @@ -491,48 +444,46 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->optimize_b_precheck = 0; sf->jnt_comp_fast_tx_search = 0; sf->jnt_comp_skip_mv_search = 0; + sf->reuse_inter_intra_mode = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; } - sf->use_rd_breakout = 0; sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; - sf->use_fast_coef_updates = TWO_LOOP; sf->use_fast_coef_costing = 0; - sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set - sf->schedule_mode_search = 0; - for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_LARGEST; - sf->reuse_inter_pred_sby = 0; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. sf->always_this_block_size = BLOCK_16X16; - sf->search_type_check_frequency = 50; // Recode loop tolerance %. 
sf->recode_tolerance = 25; - sf->default_interp_filter = SWITCHABLE; sf->partition_search_breakout_dist_thr = 0; sf->partition_search_breakout_rate_thr = 0; sf->simple_model_rd_from_var = 0; sf->prune_ext_partition_types_search_level = 0; + sf->ml_prune_rect_partition = 0; sf->ml_prune_ab_partition = 0; sf->ml_prune_4_partition = 0; sf->fast_cdef_search = 0; + for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) + sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled. // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; sf->gm_search_type = GM_FULL_SEARCH; + sf->gm_disable_recode = 0; sf->use_fast_interpolation_filter_search = 0; sf->skip_repeat_interpolation_filter_search = 0; sf->use_hash_based_trellis = 0; + sf->prune_comp_search_by_single_result = 0; + sf->skip_repeated_newmv = 0; // Set decoder side speed feature to use less dual sgr modes sf->dual_sgr_penalty_level = 0; sf->inter_mode_rd_model_estimation = 0; - - set_dev_sf(cpi, sf, oxcf->dev_sf); + sf->obmc_full_pixel_search_level = 0; if (oxcf->mode == GOOD) set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); @@ -599,10 +550,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { x->min_partition_size = sf->default_min_partition_size; x->max_partition_size = sf->default_max_partition_size; - if (!cpi->oxcf.frame_periodic_boost) { - sf->max_delta_qindex = 0; - } - // This is only used in motion vector unit test. 
if (cpi->oxcf.motion_vector_unit_test == 1) cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; @@ -611,5 +558,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { #if CONFIG_DIST_8X8 if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0; + + if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8; #endif // CONFIG_DIST_8X8 } diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index d0408ba2f..41013b2e7 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_SPEED_FEATURES_H_ -#define AV1_ENCODER_SPEED_FEATURES_H_ +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ #include "av1/common/enums.h" @@ -54,25 +54,6 @@ enum { (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), - INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | - (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), - INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | - (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), - INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | - (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | - (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), - INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | - (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | - (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | - (1 << NEAR_NEWMV), - INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | - (1 << NEW_NEARESTMV) | (1 << 
NEAREST_NEWMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | - (1 << NEAR_NEARMV), INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | @@ -132,11 +113,6 @@ typedef enum { // Other methods to come } SUBPEL_SEARCH_METHODS; -typedef enum { - NO_MOTION_THRESHOLD = 0, - LOW_MOTION_THRESHOLD = 7 -} MOTION_THRESHOLD; - typedef enum { USE_FULL_RD = 0, USE_FAST_RD, @@ -178,12 +154,6 @@ typedef enum { FLAG_SKIP_INTRA_LOWVAR = 1 << 5, } MODE_SEARCH_SKIP_LOGIC; -typedef enum { - FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR, - FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, - FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP, -} INTERP_FILTER_MASK; - typedef enum { NO_PRUNE = 0, // eliminates one tx type in vertical and horizontal direction @@ -224,16 +194,6 @@ typedef enum { REFERENCE_PARTITION } PARTITION_SEARCH_TYPE; -typedef enum { - // Does a dry run to see if any of the contexts need to be updated or not, - // before the final run. - TWO_LOOP = 0, - - // No dry run, also only half the coef contexts and bands are updated. - // The rest are not updated at all. - ONE_LOOP_REDUCED = 1 -} FAST_COEFF_UPDATE; - typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -257,9 +217,6 @@ typedef struct MV_SPEED_FEATURES { // Control when to stop subpel search int subpel_force_stop; - - // This variable sets the step_param used in full pel motion search. - int fullpel_search_step_param; } MV_SPEED_FEATURES; #define MAX_MESH_STEP 4 @@ -332,13 +289,6 @@ typedef struct SPEED_FEATURES { // mode to be evaluated. A high value means we will be faster. 
int adaptive_rd_thresh; - // Coefficient probability model approximation step size - int coeff_prob_appx_step; - - // The threshold is to determine how slow the motino is, it is used when - // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION - MOTION_THRESHOLD lf_motion_threshold; - // Determine which method we use to determine transform size. We can choose // between options like full rd, largest for prediction size, largest // for intra and model coefs for the rest. @@ -355,11 +305,6 @@ typedef struct SPEED_FEATURES { // largest transform only, since the largest transform block size is 64x64. int tx_size_search_lgr_block; - // After looking at the first set of modes (set by index here), skip - // checking modes for reference frames that don't match the reference frame - // of the best so far. - int mode_skip_start; - PARTITION_SEARCH_TYPE partition_search_type; TX_TYPE_SEARCH tx_type_search; @@ -397,6 +342,9 @@ typedef struct SPEED_FEATURES { // aggressiveness of pruning in order. int prune_ext_partition_types_search_level; + // Use a ML model to prune horz and vert partitions + int ml_prune_rect_partition; + // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. int ml_prune_ab_partition; @@ -413,12 +361,16 @@ typedef struct SPEED_FEATURES { int mode_pruning_based_on_two_pass_partition_search; // Skip rectangular partition test when partition type none gives better - // rd than partition type split. - int less_rectangular_check; + // rd than partition type split. Can take values 0 - 2, 0 referring to no + // skipping, and 1 - 2 increasing aggressiveness of skipping in order. + int less_rectangular_check_level; // Use square partition only beyond this block size. BLOCK_SIZE use_square_partition_only_threshold; + // Prune reference frames for rectangular partitions. 
+ int prune_ref_frame_for_rect_partitions; + // Sets min and max partition sizes for this superblock based on the // same superblock in last encoded frame, and the left and above neighbor. AUTO_MIN_MAX_MODE auto_min_max_partition_size; @@ -435,10 +387,6 @@ typedef struct SPEED_FEATURES { // frame's partitioning. Only used if use_lastframe_partitioning is set. int adjust_partitioning_from_last_frame; - // How frequently we re do the partitioning from scratch. Only used if - // use_lastframe_partitioning is set. - int last_partitioning_redo_frequency; - // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable // it always, to allow it for only Last frame and Intra, disable it for all // inter modes or to enable it always. @@ -461,8 +409,6 @@ typedef struct SPEED_FEATURES { // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; - int schedule_mode_search; - // Allows sub 8x8 modes to use the prediction filter that was determined // best for 8x8 mode. If set to 0 we always re check all the filters for // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter @@ -472,20 +418,10 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search - int cb_pred_filter_search; - int cb_partition_search; int alt_ref_search_fp; - // Use finer quantizer in every other few frames that run variable block - // partition type search. - int force_frame_boost; - - // Maximally allowed base quantization index fluctuation. 
- int max_delta_qindex; - // Implements various heuristics to skip searching modes // The heuristics selected are based on flags // defined in the MODE_SEARCH_SKIP_HEURISTICS enum @@ -506,22 +442,9 @@ typedef struct SPEED_FEATURES { int intra_y_mode_mask[TX_SIZES]; int intra_uv_mode_mask[TX_SIZES]; - // This variable enables an early break out of mode testing if the model for - // rd built from the prediction signal indicates a value that's much - // higher than the best rd we've seen so far. - int use_rd_breakout; - // This feature controls how the loop filter level is determined. LPF_PICK_METHOD lpf_pick; - // This feature limits the number of coefficients updates we actually do - // by only looking at counts from 1/2 the bands. - FAST_COEFF_UPDATE use_fast_coef_updates; - - // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV - // modes are used in order from LSB to MSB for each BLOCK_SIZE. - int inter_mode_mask[BLOCK_SIZES_ALL]; - // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. int use_fast_coef_costing; @@ -535,28 +458,13 @@ typedef struct SPEED_FEATURES { // TODO(aconverse): Fold this into one of the other many mode skips BLOCK_SIZE max_intra_bsize; - // The frequency that we check if - // FIXED_PARTITION search type should be used. - int search_type_check_frequency; - - // When partition is pre-set, the inter prediction result from pick_inter_mode - // can be reused in final block encoding process. It is enabled only for real- - // time mode speed 6. - int reuse_inter_pred_sby; - - // default interp filter choice - InterpFilter default_interp_filter; - - // adaptive interp_filter search to allow skip of certain filter types. - int adaptive_interp_filter_search; - - // mask for skip evaluation of certain interp_filter type. - INTERP_FILTER_MASK interp_filter_search_mask; - // Partition search early breakout thresholds. 
int64_t partition_search_breakout_dist_thr; int partition_search_breakout_rate_thr; + // Thresholds for ML based partition search breakout. + int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES]; + // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -577,6 +485,9 @@ typedef struct SPEED_FEATURES { GM_SEARCH_TYPE gm_search_type; + // whether to disable the global motion recode loop + int gm_disable_recode; + // Do limited interpolation filter search for dual filters, since best choice // usually includes EIGHTTAP_REGULAR. int use_fast_interpolation_filter_search; @@ -624,6 +535,25 @@ typedef struct SPEED_FEATURES { // Dynamically estimate final rd from prediction error and mode cost int inter_mode_rd_model_estimation; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. It is better to search same + // single inter mode as a group. 
+ int prune_comp_search_by_single_result; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // flag to skip NEWMV mode in drl if the motion search result is the same + int skip_repeated_newmv; } SPEED_FEATURES; struct AV1_COMP; @@ -635,4 +565,4 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi); } // extern "C" #endif -#endif // AV1_ENCODER_SPEED_FEATURES_H_ +#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index d7e4f4eb3..75fdf02a5 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -25,6 +25,7 @@ #include "av1/encoder/mcomp.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ratectrl.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/temporal_filter.h" #include "aom_dsp/aom_dsp_common.h" @@ -37,13 +38,12 @@ static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, uint8_t *pred, struct scale_factors *scale, int x, int y, - int can_use_previous) { - const int which_mv = 0; + int can_use_previous, int num_planes) { const MV mv = { mv_row, mv_col }; enum mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly - ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd); + ConvolveParams conv_params = get_conv_params(0, 0, xd->bd); const InterpFilters interp_filters = xd->mi[0]->interp_filters; WarpTypesAllowed warp_types; memset(&warp_types, 0, sizeof(WarpTypesAllowed)); @@ -55,37 +55,21 @@ static void 
temporal_filter_predictors_mb_c( uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, interp_filters, - &warp_types, x, y, 0, MV_PRECISION_Q3, x, - y, xd, can_use_previous); - - av1_highbd_build_inter_predictor( - u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, - x, y, 1, mv_precision_uv, x, y, xd, can_use_previous); - - av1_highbd_build_inter_predictor( - v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types, - x, y, 2, mv_precision_uv, x, y, xd, can_use_previous); - return; - } av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, &conv_params, interp_filters, &warp_types, x, y, 0, 0, MV_PRECISION_Q3, x, y, xd, can_use_previous); - av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, &warp_types, x, y, 1, - 0, mv_precision_uv, x, y, xd, can_use_previous); + if (num_planes > 1) { + av1_build_inter_predictor( + u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous); - av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - &conv_params, interp_filters, &warp_types, x, y, 2, - 0, mv_precision_uv, x, y, xd, can_use_previous); + av1_build_inter_predictor( + v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous); + } } void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int 
stride, @@ -214,7 +198,8 @@ void av1_highbd_temporal_filter_apply_c( static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, - int stride) { + int stride, int x_pos, + int y_pos) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; @@ -250,11 +235,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, x->mvcost = x->mv_cost_stack; x->nmvjointcost = x->nmv_vec_cost; - // Use mv costing from x->mvcost directly - av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, - cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0, - &best_ref_mv1); - + av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, + NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, 0, 0, x_pos, y_pos, 0); x->mv_limits = tmp_mv_limits; // Ignore mv costing by sending NULL pointer instead of cost array @@ -370,7 +353,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, // Find best match in this frame by MC int err = temporal_filter_find_matching_mb_c( cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + mb_col * 16, mb_row * 16); // Assign higher weight to matching MB if it's error // score is lower. 
If not applying MC default behavior @@ -386,7 +370,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16, - mb_row * 16, cm->allow_warped_motion); + mb_row * 16, cm->allow_warped_motion, num_planes); // Apply the filter (YUV) if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -556,14 +540,6 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, strength = group_boost / 300; } - // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; - } - } - *arnr_frames = frames; *arnr_strength = strength; } @@ -593,21 +569,6 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { int which_arf = gf_group->arf_update_idx[gf_group->index]; -#if USE_GF16_MULTI_LAYER - if (cpi->rc.baseline_gf_interval == 16) { - // Identify the index to the current ARF. - const int num_arfs_in_gf = cpi->num_extra_arfs + 1; - int arf_idx; - for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) { - if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) { - which_arf = arf_idx; - break; - } - } - assert(arf_idx < num_arfs_in_gf); - } -#endif // USE_GF16_MULTI_LAYER - // Set the temporal filtering status for the corresponding OVERLAY frame if (strength == 0 && frames_to_blur == 1) cpi->is_arf_filter_off[which_arf] = 1; diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h index bc0863a63..2ddc68b2c 100644 --- a/third_party/aom/av1/encoder/temporal_filter.h +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_ -#define AV1_ENCODER_TEMPORAL_FILTER_H_ +#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ +#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { @@ -22,4 +22,4 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance); } // extern "C" #endif -#endif // AV1_ENCODER_TEMPORAL_FILTER_H_ +#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index de1cbe99c..63b505f36 100644 --- a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_TOKENIZE_H_ -#define AV1_ENCODER_TOKENIZE_H_ +#ifndef AOM_AV1_ENCODER_TOKENIZE_H_ +#define AOM_AV1_ENCODER_TOKENIZE_H_ #include "av1/common/entropy.h" #include "av1/encoder/block.h" @@ -70,4 +70,4 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // AV1_ENCODER_TOKENIZE_H_ +#endif // AOM_AV1_ENCODER_TOKENIZE_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h index 69063b801..405bc9e6e 100644 --- a/third_party/aom/av1/encoder/tx_prune_model_weights.h +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ -#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { @@ -19,79 +19,114 @@ extern "C" { #include "av1/encoder/ml.h" // Tx type model for 4x4 block. 
-static const float av1_tx_type_nn_weights_4x4_layer0[32] = { - 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f, - -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f, - 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f, - -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f, - 1.33417f, 1.72899f, 1.17983f, 0.10552f, +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, }; -static const float av1_tx_type_nn_bias_4x4_layer0[8] = { - 1.96273f, -0.69845f, -0.10999f, -1.11311f, - 1.35101f, 0.43842f, -0.29264f, -1.15376f, +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, }; -static const float av1_tx_type_nn_weights_4x4_layer1[32] = { - 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f, - -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f, - 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f, - -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f, - 0.06517f, 0.72885f, 0.24387f, 1.28535f, +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, }; -static const float av1_tx_type_nn_bias_4x4_layer1[4] = { 
- 1.23769f, - 1.40308f, - 0.09871f, - 1.82070f, +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, }; -static const NN_CONFIG av1_tx_type_nnconfig_4x4 = { +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - av1_tx_type_nn_weights_4x4_layer0, - av1_tx_type_nn_weights_4x4_layer1, - }, - { - av1_tx_type_nn_bias_4x4_layer0, - av1_tx_type_nn_bias_4x4_layer1, - }, + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + 
av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } }; /******************************************************************************/ // Tx type model for 4x8 block. static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { - 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f, - 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f, - 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f, - -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f, - 0.07575f, 0.34687f, 0.20518f, -0.87850f, + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, }; static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { - 1.14049f, -0.18583f, 1.92114f, -0.72057f, - 1.32715f, 0.96713f, 1.09877f, -0.64345f, + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, }; static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { - 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f, - -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f, - 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f, - 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f, - 1.01163f, 2.06308f, 1.23331f, -0.15481f, + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, }; static const 
float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { - 2.14443f, - 1.98356f, - 0.74616f, - 2.58795f, + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { @@ -101,62 +136,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { { 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_4x8_hor_layer0, - av1_tx_type_nn_weights_4x8_hor_layer1, - }, - { - av1_tx_type_nn_bias_4x8_hor_layer0, - av1_tx_type_nn_bias_4x8_hor_layer1, - }, + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { - 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f, - 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f, - -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f, - -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f, - 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f, - 0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f, - 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f, - 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f, - 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f, - 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f, - 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f, - 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f, - 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f, - -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f, - 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f, - -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f, - -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f, - 
0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f, - 0.73600f, -0.11233f, + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { - -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f, - -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f, - -0.63015f, -0.20008f, 1.24048f, -0.21265f, + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, }; static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { - -0.79795f, 0.45973f, -0.54188f, -1.05095f, 
0.64404f, -0.56470f, -0.57018f, - 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f, - 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f, - 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f, - 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, -0.97669f, -1.06351f, - 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f, - 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f, - -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f, - -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f, - 0.50359f, + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { - 0.39274f, - 1.27276f, - 0.30322f, - 2.55238f, + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { @@ -166,64 +196,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_4x8_ver_layer0, - av1_tx_type_nn_weights_4x8_ver_layer1, - }, - { - av1_tx_type_nn_bias_4x8_ver_layer0, - av1_tx_type_nn_bias_4x8_ver_layer1, - }, + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, 
av1_tx_type_nn_bias_4x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x4 block. static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { - 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f, - -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f, - -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f, - -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f, - 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f, - 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f, - -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f, - 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f, - -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f, - 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f, - 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f, - 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f, - 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f, - -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f, - 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f, - -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f, - 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f, - 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f, - 0.25223f, -0.06761f, + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 
1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { - -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f, - -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f, - -0.65392f, -0.25881f, 1.67995f, -0.03550f, + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, }; static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { - -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f, - 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f, - 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f, - 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f, - 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f, - 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f, - 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f, - -0.42115f, 
-0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f, - -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f, - 0.44232f, + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { - 0.14889f, - 1.74197f, - 0.53696f, - 2.87574f, + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { @@ -233,42 +258,37 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x4_hor_layer0, - av1_tx_type_nn_weights_8x4_hor_layer1, - }, - { - av1_tx_type_nn_bias_8x4_hor_layer0, - av1_tx_type_nn_bias_8x4_hor_layer1, - }, + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { - 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f, - 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f, - 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f, - -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f, - 0.08220f, 0.34338f, 0.20245f, -0.88920f, + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, 
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { - 1.14995f, -0.16021f, 2.38325f, -0.65179f, - 1.09624f, 1.07662f, 0.63837f, -0.64847f, + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, }; static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { - 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f, - -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f, - 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f, - 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f, - 1.46345f, 1.84714f, 1.06804f, -0.13610f, + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { - 2.41028f, - 1.95675f, - 0.82387f, - 2.41923f, + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { @@ -278,131 +298,181 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { { 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x4_ver_layer0, - av1_tx_type_nn_weights_8x4_ver_layer1, - }, - { - av1_tx_type_nn_bias_8x4_ver_layer0, - av1_tx_type_nn_bias_8x4_ver_layer1, - }, + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } }; 
/******************************************************************************/ // Tx type model for 8x8 block. -static const float av1_tx_type_nn_weights_8x8_layer0[128] = { - 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f, - 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f, - -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f, - -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f, - 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f, - 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f, - 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f, - 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f, - -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f, - 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f, - 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f, - -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f, - 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f, - 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f, - 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f, - -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f, - -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f, - 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f, - 1.18941f, 0.10346f, -}; - -static const float av1_tx_type_nn_bias_8x8_layer0[16] = { - 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f, - -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f, - 0.64806f, -0.52520f, 1.38838f, 0.15519f, -}; - -static const float av1_tx_type_nn_weights_8x8_layer1[64] = { - -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f, - -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f, 
- 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f, - 0.12256f, -1.39594f, -1.02829f, 0.06134f, 2.25646f, -1.25196f, -2.65317f, - -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f, - -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f, - -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f, - -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f, - -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f, - -1.11974f, -}; - -static const float av1_tx_type_nn_bias_8x8_layer1[4] = { - 0.53574f, - 1.57736f, - -0.13698f, - 2.64613f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_8x8 = { +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, 
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 
0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 
0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - av1_tx_type_nn_weights_8x8_layer0, - av1_tx_type_nn_weights_8x8_layer1, - }, - { - av1_tx_type_nn_bias_8x8_layer0, - av1_tx_type_nn_bias_8x8_layer1, - }, + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x16 block. static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { - 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f, - 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f, - -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f, - 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f, - 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f, - 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f, - 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f, - -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f, - -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f, - 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f, - 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f, - 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f, - 0.48759f, 0.55672f, 0.62597f, -0.38846f, 1.72124f, 0.08214f, -0.06650f, - 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f, - 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f, - -0.11344f, -0.79108f, 
0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f, - 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f, - 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f, - 1.07698f, 0.02531f, + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { - -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f, - -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f, - 1.46564f, -0.42959f, 1.55720f, 0.18479f, + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, 
-0.81690f, 0.08302f, }; static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { - -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f, - 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f, - 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f, - 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f, - 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f, - 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f, - -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f, - 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f, - -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f, - 1.16772f, + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { - 1.25783f, - 1.19452f, - 0.69964f, - 2.41982f, + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { @@ -412,62 +482,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x16_hor_layer0, - av1_tx_type_nn_weights_8x16_hor_layer1, - }, - { - av1_tx_type_nn_bias_8x16_hor_layer0, - av1_tx_type_nn_bias_8x16_hor_layer1, - }, + { 
av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { - 0.90888f, 0.86305f, 0.81674f, 0.75352f, 1.07834f, 0.99048f, 0.96355f, - 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f, - 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f, - 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f, - 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f, - 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f, - -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f, - 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 0.54576f, - -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f, - 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f, - 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f, - 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f, - 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f, - -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f, - 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f, - -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f, - 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f, - 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f, - 0.25269f, 0.01562f, + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 
0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { - -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f, - -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f, - -0.25956f, -0.29608f, 1.24359f, -0.09167f, + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, }; static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { - -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f, - 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f, - 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f, - 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f, - -0.42349f, 0.15375f, 0.52132f, -0.52421f, 0.89586f, -0.73778f, -0.10911f, - 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f, - 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 
1.50938f, - 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f, - -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f, - 0.44198f, + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { - 0.68329f, - 1.33555f, - 0.25943f, - 3.23439f, + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { @@ -477,64 +542,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_8x16_ver_layer0, - av1_tx_type_nn_weights_8x16_ver_layer1, - }, - { - av1_tx_type_nn_bias_8x16_ver_layer0, - av1_tx_type_nn_bias_8x16_ver_layer1, - }, + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } }; /******************************************************************************/ // Tx type model for 16x8 block. 
static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { - 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, - 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, - -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, - 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, - 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, - 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, - 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, - 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, - -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, - 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, - 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, - -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, - 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, - 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, - 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, - -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, - 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, - 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, - 0.43289f, -0.00362f, + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, 
-0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { - -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, - -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, - -0.14741f, -0.43954f, 1.72137f, -0.21704f, + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, }; static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { - -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, - 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, - 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, - 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, - -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, - 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, - -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, - 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, - -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, - 
0.63304f, + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { - 0.71765f, - 1.40400f, - 0.32221f, - 3.07234f, + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { @@ -544,62 +604,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x8_hor_layer0, - av1_tx_type_nn_weights_16x8_hor_layer1, - }, - { - av1_tx_type_nn_bias_16x8_hor_layer0, - av1_tx_type_nn_bias_16x8_hor_layer1, - }, + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { - 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f, - 0.17179f, 0.68612f, 0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f, - -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f, - -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f, - 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f, - 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f, - 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f, - -0.00317f, -0.06403f, 0.95442f, 
1.59636f, 0.30602f, -0.05515f, 0.05467f, - -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f, - 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f, - 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f, - 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f, - 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f, - 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f, - 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f, - -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f, - -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f, - 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f, - 0.72124f, 0.12769f, + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, 
-1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { - -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f, - -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f, - -0.55834f, -0.36938f, 1.56759f, 0.07238f, + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, }; static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { - -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f, - 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f, - 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f, - 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f, - -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f, - 1.05440f, 0.62580f, -0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f, - -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f, - 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f, - -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f, - 0.99208f, + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, 
-1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { - 0.68774f, - 0.88572f, - 0.77462f, - 3.05667f, + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { @@ -609,14 +664,9 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x8_ver_layer0, - av1_tx_type_nn_weights_16x8_ver_layer1, - }, - { - av1_tx_type_nn_bias_16x8_ver_layer0, - av1_tx_type_nn_bias_16x8_ver_layer1, - }, + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } }; /******************************************************************************/ @@ -687,445 +737,253 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { }; /******************************************************************************/ -// Tx type model for 16x32 block. 
-static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = { - 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f, - 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f, - -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f, - 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f, - 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f, - 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f, - 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f, - 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f, - -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f, - 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f, - 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f, - -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f, - 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f, - 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f, - 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f, - -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f, - 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f, - 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f, - 0.43289f, -0.00362f, -}; - -static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = { - -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f, - -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f, - -0.14741f, -0.43954f, 1.72137f, -0.21704f, -}; - -static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = { - -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f, - 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f, - 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f, - 0.45571f, 
-0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f, - -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f, - 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f, - -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f, - 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f, - -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f, - 0.63304f, -}; - -static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = { - 0.71765f, - 1.40400f, - 0.32221f, - 3.07234f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = { +// Tx type model for 4x16 block. +static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + 
av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, 
-0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x32_hor_layer0, - av1_tx_type_nn_weights_16x32_hor_layer1, - }, - { - av1_tx_type_nn_bias_16x32_hor_layer0, - av1_tx_type_nn_bias_16x32_hor_layer1, - }, + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } }; +/******************************************************************************/ -static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = { - -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f, - -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f, - 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f, - 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f, - 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f, - 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f, - 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f, - 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, -0.48812f, 0.72875f, - 0.06585f, 0.18818f, -0.02109f, 
-0.10996f, 0.00187f, -0.02078f, 0.04484f, - -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f, - 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f, - -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f, - -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f, - -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f, 0.19723f, - 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f, - 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f, - 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f, - -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f, - 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f, - 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f, - -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f, - 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f, - -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f, - -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f, - -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f, - 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f, - 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f, - -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f, - 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f, - 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f, - -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f, - -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f, - 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f, - -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f, - 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f, - 
0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f, - 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f, - -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f, - -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f, - 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f, - 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f, - 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f, - 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f, - -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f, - -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f, - -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f, - 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f, - -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f, - -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f, - -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f, - 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f, - 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f, - -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f, - 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f, - -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, -2.43802f, 0.41834f, - 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f, - -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f, - 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f, - -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f, - 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f, - -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f, - 0.57665f, 0.14171f, 0.54693f, 
-0.22144f, -0.59664f, 0.13295f, 0.07057f, - -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f, - 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f, - -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f, - 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f, - -0.41329f, 0.02632f, 0.23411f, 0.25344f, 0.16468f, 0.31007f, 0.21845f, - 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f, - 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f, - 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f, - -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f, - -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f, - -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f, - -0.01752f, -}; - -static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = { - 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f, - -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f, - 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f, - 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f, - -0.11834f, -0.24998f, -0.33976f, 0.35497f, -}; - -static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = { - 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f, - 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f, - 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f, - -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f, - 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f, - -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f, - 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f, - 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f, - 0.14139f, 
0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f, - -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f, - -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f, - 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f, - 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f, - 0.12213f, 0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f, - -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f, - -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f, - 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f, - -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f, - 0.14978f, 0.91325f, -}; - -static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = { - 1.03659f, - 1.80249f, - 1.25710f, - 1.32000f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = { - 16, // num_inputs - 4, // num_outputs - 1, // num_hidden_layers +// Tx type model for 16x4 block. 
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + 
-0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers { - 32, + 16, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_16x32_ver_layer0, - av1_tx_type_nn_weights_16x32_ver_layer1, - }, - { - av1_tx_type_nn_bias_16x32_ver_layer0, - av1_tx_type_nn_bias_16x32_ver_layer1, - }, + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } }; -/******************************************************************************/ -// Tx type model for 32x16 block. 
-static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = { - -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f, - -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f, - 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f, - 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f, - 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f, - 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f, - 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f, - -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f, - -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f, - -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f, - 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f, - 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f, - 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f, - 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f, - 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f, - -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f, - 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f, - -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f, - 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f, - -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f, - -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f, - 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f, - -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f, - 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f, - 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f, - 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 
0.56870f, - 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f, - 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f, - -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f, - -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f, - 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f, - -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f, - 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f, - 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f, - 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f, - 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f, - 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f, - -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f, - 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f, - 0.04214f, 0.00030f, 0.02410f, 0.19966f, -0.04246f, 0.00442f, 0.23121f, - 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f, - 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f, - -0.72638f, 0.02774f, 0.48540f, -0.72016f, -0.27721f, 0.31559f, 0.07322f, - 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f, - 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f, - 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f, - 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f, - 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f, - 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f, - 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f, - 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f, - -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f, - 0.17597f, 0.28668f, 0.31193f, -0.43281f, 
0.43267f, -0.50495f, 0.01969f, - 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f, - -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f, - 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f, - -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f, - 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f, - -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f, - -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f, - 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f, - 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f, - -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f, - 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f, - -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f, - 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f, - -0.18461f, -0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f, - 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f, - 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f, - -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f, - -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f, - -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f, - -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f, - -0.14533f, -}; - -static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = { - 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f, - -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f, - 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f, - 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f, - -0.14988f, -0.67044f, 0.79659f, 0.70610f, -}; - -static const 
float av1_tx_type_nn_weights_32x16_hor_layer1[128] = { - 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f, - -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f, - 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f, - 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f, - -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f, - 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f, - 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f, - -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f, - 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f, - -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 0.10969f, - 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f, - -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f, - 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 0.01641f, - 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f, - -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f, - -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f, - -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f, - -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f, - 0.89060f, 0.75948f, -}; - -static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = { - 0.95359f, - 1.56043f, - 1.06017f, - 2.54520f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = { - 16, // num_inputs - 4, // num_outputs - 1, // num_hidden_layers - { - 32, - }, // num_hidden_nodes - { - av1_tx_type_nn_weights_32x16_hor_layer0, - av1_tx_type_nn_weights_32x16_hor_layer1, - }, - { - av1_tx_type_nn_bias_32x16_hor_layer0, - av1_tx_type_nn_bias_32x16_hor_layer1, - }, +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, 
-0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, }; -static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = { - 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f, - 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f, - -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f, - -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f, - 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f, - 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f, - -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f, - 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f, - 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f, - 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f, - -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f, - 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f, - 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f, - -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f, - -0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f, - -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f, - -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f, - 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f, - 0.59360f, 0.25468f, -}; - -static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = { - -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f, - -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f, - 0.83852f, -0.25536f, 1.33200f, 
-0.24444f, -}; - -static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = { - -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f, - 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f, - 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f, - 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f, - -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f, - 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f, - 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f, - 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f, - 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f, - 1.01058f, -}; - -static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = { - 0.83082f, - 2.03845f, - 0.59627f, - 2.31341f, -}; - -static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = { - 8, // num_inputs +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { - 16, + 8, }, // num_hidden_nodes - { - av1_tx_type_nn_weights_32x16_ver_layer0, - av1_tx_type_nn_weights_32x16_ver_layer1, - }, - { - av1_tx_type_nn_bias_32x16_ver_layer0, - av1_tx_type_nn_bias_32x16_ver_layer1, - }, + { 
av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } }; /******************************************************************************/ // Map tx_size to its corresponding neural net model for tx type prediction. static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { - &av1_tx_type_nnconfig_4x4, // 4x4 - &av1_tx_type_nnconfig_8x8, // 8x8 - &av1_tx_type_nnconfig_16x16, // 16x16 - NULL, // 32x32 - NULL, // 64x64 - &av1_tx_type_nnconfig_4x8_hor, // 4x8 - &av1_tx_type_nnconfig_8x4_hor, // 8x4 - &av1_tx_type_nnconfig_8x16_hor, // 8x16 - &av1_tx_type_nnconfig_16x8_hor, // 16x8 - &av1_tx_type_nnconfig_16x32_hor, // 16x32 - &av1_tx_type_nnconfig_32x16_hor, // 32x16 - NULL, // 32x64 - NULL, // 64x32 - NULL, // 4x16 - NULL, // 16x4 - NULL, // 8x32 - NULL, // 32x8 - NULL, // 16x64 - NULL, // 64x16 + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform }; static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { - &av1_tx_type_nnconfig_4x4, // 4x4 transform - &av1_tx_type_nnconfig_8x8, // 8x8 transform - &av1_tx_type_nnconfig_16x16, // 16x16 transform - NULL, // 32x32 transform - NULL, // 64x64 transform - &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform - &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform - 
&av1_tx_type_nnconfig_8x16_ver, // 8x16 transform - &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform - &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform - &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform - NULL, // 32x64 transform - NULL, // 64x32 transform - NULL, // 4x16 transform - NULL, // 16x4 transform - NULL, // 8x32 transform - NULL, // 32x8 transform - NULL, // 16x64 transform - NULL, // 64x16 transform + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform }; // Tx split model for 4x8 block. 
@@ -2083,4 +1941,4 @@ static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { } // extern "C" #endif -#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c index c71f2e74c..07615543c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, } void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit) { + int8_t cos_bit, const int instride, + const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); @@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, // stage 1 __m128i x1[64]; - x1[0] = _mm_add_epi32(input[0], input[63]); - x1[63] = _mm_sub_epi32(input[0], input[63]); - x1[1] = _mm_add_epi32(input[1], input[62]); - x1[62] = _mm_sub_epi32(input[1], input[62]); - x1[2] = _mm_add_epi32(input[2], input[61]); - x1[61] = _mm_sub_epi32(input[2], input[61]); - x1[3] = _mm_add_epi32(input[3], input[60]); - x1[60] = _mm_sub_epi32(input[3], input[60]); - x1[4] = _mm_add_epi32(input[4], input[59]); - x1[59] = _mm_sub_epi32(input[4], input[59]); - x1[5] = _mm_add_epi32(input[5], input[58]); - x1[58] = _mm_sub_epi32(input[5], input[58]); - x1[6] = _mm_add_epi32(input[6], input[57]); - x1[57] = _mm_sub_epi32(input[6], input[57]); - x1[7] = _mm_add_epi32(input[7], input[56]); - x1[56] = _mm_sub_epi32(input[7], input[56]); - x1[8] = _mm_add_epi32(input[8], input[55]); - x1[55] = _mm_sub_epi32(input[8], input[55]); - x1[9] = _mm_add_epi32(input[9], input[54]); - x1[54] = _mm_sub_epi32(input[9], input[54]); - x1[10] = _mm_add_epi32(input[10], input[53]); - x1[53] = _mm_sub_epi32(input[10], input[53]); - x1[11] = 
_mm_add_epi32(input[11], input[52]); - x1[52] = _mm_sub_epi32(input[11], input[52]); - x1[12] = _mm_add_epi32(input[12], input[51]); - x1[51] = _mm_sub_epi32(input[12], input[51]); - x1[13] = _mm_add_epi32(input[13], input[50]); - x1[50] = _mm_sub_epi32(input[13], input[50]); - x1[14] = _mm_add_epi32(input[14], input[49]); - x1[49] = _mm_sub_epi32(input[14], input[49]); - x1[15] = _mm_add_epi32(input[15], input[48]); - x1[48] = _mm_sub_epi32(input[15], input[48]); - x1[16] = _mm_add_epi32(input[16], input[47]); - x1[47] = _mm_sub_epi32(input[16], input[47]); - x1[17] = _mm_add_epi32(input[17], input[46]); - x1[46] = _mm_sub_epi32(input[17], input[46]); - x1[18] = _mm_add_epi32(input[18], input[45]); - x1[45] = _mm_sub_epi32(input[18], input[45]); - x1[19] = _mm_add_epi32(input[19], input[44]); - x1[44] = _mm_sub_epi32(input[19], input[44]); - x1[20] = _mm_add_epi32(input[20], input[43]); - x1[43] = _mm_sub_epi32(input[20], input[43]); - x1[21] = _mm_add_epi32(input[21], input[42]); - x1[42] = _mm_sub_epi32(input[21], input[42]); - x1[22] = _mm_add_epi32(input[22], input[41]); - x1[41] = _mm_sub_epi32(input[22], input[41]); - x1[23] = _mm_add_epi32(input[23], input[40]); - x1[40] = _mm_sub_epi32(input[23], input[40]); - x1[24] = _mm_add_epi32(input[24], input[39]); - x1[39] = _mm_sub_epi32(input[24], input[39]); - x1[25] = _mm_add_epi32(input[25], input[38]); - x1[38] = _mm_sub_epi32(input[25], input[38]); - x1[26] = _mm_add_epi32(input[26], input[37]); - x1[37] = _mm_sub_epi32(input[26], input[37]); - x1[27] = _mm_add_epi32(input[27], input[36]); - x1[36] = _mm_sub_epi32(input[27], input[36]); - x1[28] = _mm_add_epi32(input[28], input[35]); - x1[35] = _mm_sub_epi32(input[28], input[35]); - x1[29] = _mm_add_epi32(input[29], input[34]); - x1[34] = _mm_sub_epi32(input[29], input[34]); - x1[30] = _mm_add_epi32(input[30], input[33]); - x1[33] = _mm_sub_epi32(input[30], input[33]); - x1[31] = _mm_add_epi32(input[31], input[32]); - x1[32] = _mm_sub_epi32(input[31], 
input[32]); + x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]); + x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]); + x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]); + x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]); + x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]); + x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]); + x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]); + x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]); + x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]); + x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]); + x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]); + x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]); + x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]); + x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]); + x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]); + x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]); + x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]); + x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]); + x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]); + x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]); + x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]); + x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]); + x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]); + x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]); + x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]); + x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]); + x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]); + x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]); + x1[14] = _mm_add_epi32(input[14 * instride], input[49 * 
instride]); + x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]); + x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]); + x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]); + x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]); + x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]); + x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]); + x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]); + x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]); + x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]); + x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]); + x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]); + x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]); + x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]); + x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]); + x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]); + x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]); + x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]); + x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]); + x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]); + x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]); + x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]); + x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]); + x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]); + x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]); + x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]); + x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]); + x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]); + x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]); + x1[35] = 
_mm_sub_epi32(input[28 * instride], input[35 * instride]); + x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]); + x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]); + x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]); + x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]); + x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]); + x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]); // stage 2 __m128i x2[64]; @@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, x10[48], __rounding, cos_bit); // stage 11 - output[0] = x10[0]; - output[1] = x10[32]; - output[2] = x10[16]; - output[3] = x10[48]; - output[4] = x10[8]; - output[5] = x10[40]; - output[6] = x10[24]; - output[7] = x10[56]; - output[8] = x10[4]; - output[9] = x10[36]; - output[10] = x10[20]; - output[11] = x10[52]; - output[12] = x10[12]; - output[13] = x10[44]; - output[14] = x10[28]; - output[15] = x10[60]; - output[16] = x10[2]; - output[17] = x10[34]; - output[18] = x10[18]; - output[19] = x10[50]; - output[20] = x10[10]; - output[21] = x10[42]; - output[22] = x10[26]; - output[23] = x10[58]; - output[24] = x10[6]; - output[25] = x10[38]; - output[26] = x10[22]; - output[27] = x10[54]; - output[28] = x10[14]; - output[29] = x10[46]; - output[30] = x10[30]; - output[31] = x10[62]; - output[32] = x10[1]; - output[33] = x10[33]; - output[34] = x10[17]; - output[35] = x10[49]; - output[36] = x10[9]; - output[37] = x10[41]; - output[38] = x10[25]; - output[39] = x10[57]; - output[40] = x10[5]; - output[41] = x10[37]; - output[42] = x10[21]; - output[43] = x10[53]; - output[44] = x10[13]; - output[45] = x10[45]; - output[46] = x10[29]; - output[47] = x10[61]; - output[48] = x10[3]; - output[49] = x10[35]; - output[50] = x10[19]; - output[51] = x10[51]; - output[52] = x10[11]; - output[53] = x10[43]; - output[54] = x10[27]; - output[55] = x10[59]; - output[56] = x10[7]; - output[57] 
= x10[39]; - output[58] = x10[23]; - output[59] = x10[55]; - output[60] = x10[15]; - output[61] = x10[47]; - output[62] = x10[31]; - output[63] = x10[63]; + output[0 * outstride] = x10[0]; + output[1 * outstride] = x10[32]; + output[2 * outstride] = x10[16]; + output[3 * outstride] = x10[48]; + output[4 * outstride] = x10[8]; + output[5 * outstride] = x10[40]; + output[6 * outstride] = x10[24]; + output[7 * outstride] = x10[56]; + output[8 * outstride] = x10[4]; + output[9 * outstride] = x10[36]; + output[10 * outstride] = x10[20]; + output[11 * outstride] = x10[52]; + output[12 * outstride] = x10[12]; + output[13 * outstride] = x10[44]; + output[14 * outstride] = x10[28]; + output[15 * outstride] = x10[60]; + output[16 * outstride] = x10[2]; + output[17 * outstride] = x10[34]; + output[18 * outstride] = x10[18]; + output[19 * outstride] = x10[50]; + output[20 * outstride] = x10[10]; + output[21 * outstride] = x10[42]; + output[22 * outstride] = x10[26]; + output[23 * outstride] = x10[58]; + output[24 * outstride] = x10[6]; + output[25 * outstride] = x10[38]; + output[26 * outstride] = x10[22]; + output[27 * outstride] = x10[54]; + output[28 * outstride] = x10[14]; + output[29 * outstride] = x10[46]; + output[30 * outstride] = x10[30]; + output[31 * outstride] = x10[62]; + output[32 * outstride] = x10[1]; + output[33 * outstride] = x10[33]; + output[34 * outstride] = x10[17]; + output[35 * outstride] = x10[49]; + output[36 * outstride] = x10[9]; + output[37 * outstride] = x10[41]; + output[38 * outstride] = x10[25]; + output[39 * outstride] = x10[57]; + output[40 * outstride] = x10[5]; + output[41 * outstride] = x10[37]; + output[42 * outstride] = x10[21]; + output[43 * outstride] = x10[53]; + output[44 * outstride] = x10[13]; + output[45 * outstride] = x10[45]; + output[46 * outstride] = x10[29]; + output[47 * outstride] = x10[61]; + output[48 * outstride] = x10[3]; + output[49 * outstride] = x10[35]; + output[50 * outstride] = x10[19]; + output[51 * outstride] = 
x10[51]; + output[52 * outstride] = x10[11]; + output[53 * outstride] = x10[43]; + output[54 * outstride] = x10[27]; + output[55 * outstride] = x10[59]; + output[56 * outstride] = x10[7]; + output[57 * outstride] = x10[39]; + output[58 * outstride] = x10[23]; + output[59 * outstride] = x10[55]; + output[60 * outstride] = x10[15]; + output[61 * outstride] = x10[47]; + output[62 * outstride] = x10[31]; + output[63 * outstride] = x10[63]; } diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c index abb95f31e..8ec0256eb 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -14,6 +14,7 @@ #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" @@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, } } +static void fdct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num, + col_num); + } +} + static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; default: assert(0); } return NULL; @@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, transpose_32(txfm_size, buf_128, out_128); } +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const 
TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, + col_num, (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32x32(buf_128, out_128); +} + void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); @@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, const 
__m128i *inputB, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); @@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); @@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < width_div8; ++j) { @@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, } av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < (32 / 4); ++j) { diff --git 
a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h index c582ca0e3..38707137c 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_FWD_TXFM_AVX2_H_ -#define AV1_FWD_TXFM_AVX2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ #include static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { @@ -100,4 +100,4 @@ static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, *in1 = _mm256_srai_epi32(temp1, cos_bit); } -#endif // AV1_FWD_TXFM_AVX2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h index aa14d3ade..99a6b9082 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ -#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ #include @@ -114,4 +114,4 @@ static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { } #endif -#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h index 0adefecdb..6df2a8bdb 100644 --- a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h +++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#ifndef AV1_TXMF1D_SSE2_H_ -#define AV1_TXMF1D_SSE2_H_ +#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ +#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ #include #include "av1/common/av1_txfm.h" @@ -29,7 +29,8 @@ void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output, void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit); void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output, - int8_t cos_bit); + int8_t cos_bit, const int instride, + const int outstride); void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range); @@ -138,4 +139,4 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input, } #endif -#endif // AV1_TXMF1D_SSE2_H_ +#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 000000000..7642f57d1 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 +#include /* SSE4.1 */ +#include /* AVX2 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + const __m256i y_zeros = _mm256_setzero_si256(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + yy_storeu_256(pre_buf, y_zeros); + pre_buf += 32; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride; + uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); + + do { + yy_storeu_256(bottom_buf, y_zeros); + bottom_buf += 32; + } while (bottom_buf < bottom_buf_end); + + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (width == 4) { + do { + const __m256i c0 = yy_loadu_256(cf); + const __m256i c1 = yy_loadu_256(cf + 8); + const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); + const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); + const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); + const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); + yy_storeu_256(ls, res); + ls += 32; + cf += 16; + i += 4; + } while (i < height); + } else if (width == 8) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = 
_mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + const __m128i res0 = _mm256_castsi256_si128(res); + const __m128i res1 = _mm256_extracti128_si256(res, 1); + xx_storel_64(ls, res0); + *(int32_t *)(ls + width) = 0; + xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); + *(int32_t *)(ls + width + stride) = 0; + xx_storel_64(ls + stride * 2, res1); + *(int32_t *)(ls + width + stride * 2) = 0; + xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); + *(int32_t *)(ls + width + stride * 3) = 0; + cf += 32; + ls += stride << 2; + i += 4; + } while (i < height); + } else if (width == 16) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + xx_storeu_128(ls, _mm256_castsi256_si128(res)); + xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); + cf += 32; + *(int32_t *)(ls + width) = 0; + *(int32_t *)(ls + stride + width) = 0; + ls += stride << 1; + i += 2; + } while (i < height); + } else { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i 
absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + yy_storeu_256(ls, res); + cf += 32; + *(int32_t *)(ls + width) = 0; + ls += stride; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c index b3a879b0f..5e0687cd3 100644 --- a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -14,43 +14,55 @@ #include /* SSE4.1 */ #include "aom/aom_integer.h" -#include "aom_dsp/x86/mem_sse2.h" #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = width + TX_PAD_HOR; - memset(levels - TX_PAD_TOP * stride, 0, - sizeof(*levels) * TX_PAD_TOP * stride); - memset(levels + stride * height, 0, - sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); - const __m128i zeros = _mm_setzero_si128(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + _mm_storeu_si128((__m128i *)(pre_buf), zeros); + pre_buf += 16; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf = levels + stride * height; + uint8_t *bottom_buf_end = bottom_buf + bottom_len; + do { + _mm_storeu_si128((__m128i *)(bottom_buf), zeros); + bottom_buf += 16; + } while (bottom_buf < bottom_buf_end); + int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (width == 4) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = 
_mm_load_si128((__m128i *)(cf + width)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); - _mm_storeu_si128((__m128i *)ls, lsAB); + xx_storeu_128(ls, lsAB); ls += (stride << 1); cf += (width << 1); i += 2; } while (i < height); } else if (width == 8) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); - _mm_storeu_si128((__m128i *)ls, absAB8); + xx_storeu_128(ls, absAB8); ls += stride; cf += width; i += 1; @@ -59,16 +71,16 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, do { int j = 0; do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); - const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8)); - const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffC = xx_loadu_128(cf + 8); + const __m128i coeffD = xx_loadu_128(cf + 12); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absCD = _mm_abs_epi16(coeffCD); const __m128i absABCD = _mm_packs_epi16(absAB, absCD); - _mm_storeu_si128((__m128i *)(ls + j), absABCD); + xx_storeu_128(ls + j, absABCD); j += 16; cf += 16; } while (j < width); diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c 
b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index 4cd6371a6..535485ae8 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -17,6 +17,7 @@ #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -393,7 +394,32 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); } -static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i *)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); + _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + (void)(col_num); const int32_t *cospi = 
cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); @@ -589,7 +615,9 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { out[13] = u[3]; // buf0[3] } -static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + (void)(col_num); const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -780,82 +808,82 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); 
break; case ADST_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_DCT: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); - fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, in, stride, 1, 1, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, 
fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); - fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; @@ -940,7 +968,26 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, convert_8x8_to_16x16(in, out); } -static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); @@ -962,7 +1009,6 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i u[16], v[16], x; - const int col_num = 4; int col; // Calculate the column 0, 1, 2, 3 @@ -1226,7 +1272,8 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } } -static void 
fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); @@ -1271,25 +1318,25 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { __m128i u[16], v[16], x, y; int col; - for (col = 0; col < 4; ++col) { + for (col = 0; col < num_cols; ++col) { // stage 0 // stage 1 - u[0] = in[0 * 4 + col]; - u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]); - u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]); - u[3] = in[8 * 4 + col]; - u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]); - u[5] = in[12 * 4 + col]; - u[6] = in[4 * 4 + col]; - u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]); - u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]); - u[9] = in[14 * 4 + col]; - u[10] = in[6 * 4 + col]; - u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]); - u[12] = in[2 * 4 + col]; - u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]); - u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]); - u[15] = in[10 * 4 + col]; + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; // stage 2 v[0] = u[0]; @@ -1453,22 +1500,22 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], 
&rnding, bit); // stage 9 - out[0 * 4 + col] = v[1]; - out[1 * 4 + col] = v[14]; - out[2 * 4 + col] = v[3]; - out[3 * 4 + col] = v[12]; - out[4 * 4 + col] = v[5]; - out[5 * 4 + col] = v[10]; - out[6 * 4 + col] = v[7]; - out[7 * 4 + col] = v[8]; - out[8 * 4 + col] = v[9]; - out[9 * 4 + col] = v[6]; - out[10 * 4 + col] = v[11]; - out[11 * 4 + col] = v[4]; - out[12 * 4 + col] = v[13]; - out[13 * 4 + col] = v[2]; - out[14 * 4 + col] = v[15]; - out[15 * 4 + col] = v[0]; + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; } } @@ -1482,6 +1529,11 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) { col_txfm_8x8_rounding(&in[48], shift); } +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + static void write_buffer_16x16(const __m128i *in, int32_t *output) { const int size_8x8 = 16 * 4; write_buffer_8x8(&in[0], output); @@ -1499,85 +1551,86 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, const int8_t *shift = fwd_txfm_shift_ls[TX_16X16]; const int txw_idx = get_txw_idx(TX_16X16); const int txh_idx = get_txh_idx(TX_16X16); + const int col_num = 4; switch (tx_type) { case DCT_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - 
fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_DCT: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, 
out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); - fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, in, stride, 1, 1, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); - fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]); + fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num); 
transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; @@ -1585,3 +1638,146 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, } (void)bd; } + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 
row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + NULL, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 0); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + transpose_8x8(out + i * 16, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_16x8(in, coeff + i * 8, 16); + } + + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + 
int bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 0); + transpose_8x8(out, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + + (void)bd; +} diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 000000000..06aaaa7ee --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include // AVX2 +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m256i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s)); + const __m256i dst0 = yy_loadu_256(dst); + const __m256i r0 = _mm256_add_epi32(dst0, d0); + yy_storeu_256(dst, r0); +} + +static INLINE void acc_stat_win7_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, 
&kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + 
avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +static INLINE void acc_stat_win5_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, double *M, double *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t 
M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const double avg_square_sum = avg * avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]); + double *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, double *M, double *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + 
dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), 
shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0) { + __m256i xq_coeff = + pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS))); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, 
shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS)); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + 
for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += e * e; + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c new file mode 100644 index 000000000..04e4d1afc --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+// NOTE(review): the two system header names below were missing from this copy
+// of the patch (bare "#include" lines). They are restored on a best-effort
+// basis; confirm against upstream libaom.
+#include <assert.h>
+#include <smmintrin.h>  // SSE4.1
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+// Adds eight 32-bit products into dst[0..7].
+//
+// Loads 16 bytes from src, rearranges them with *shuffle, widens the bytes to
+// 16 bits and multiply-adds each adjacent pair of 16-bit lanes against the
+// matching pair in *kl. The eight resulting 32-bit sums are added to the
+// values already stored at dst.
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+                                  const __m128i *shuffle, const __m128i *kl) {
+  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+  const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+  const __m128i d1 =
+      _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+  const __m128i dst0 = xx_loadu_128(dst);
+  const __m128i dst1 = xx_loadu_128(dst + 4);
+  const __m128i r0 = _mm_add_epi32(dst0, d0);
+  const __m128i r1 = _mm_add_epi32(dst1, d1);
+  xx_storeu_128(dst, r0);
+  xx_storeu_128(dst + 4, r1);
+}
+
+// Accumulates the 7x7 window statistics for one row of pixels.
+//
+// dgd points at the top-left corner of the window of the pixel in column 0 of
+// this row, src at column 0 of the source row. Columns [h_start, h_end) are
+// processed two at a time. For each pair the function updates:
+//   *sumX        sum of source pixels,
+//   sumY[k][l]   sum of window pixels at window position (k, l),
+//   M_int[k][l]  sum of window pixel * source pixel,
+//   H_int[..]    sums of products of window pixels (via acc_stat_sse41()).
+//
+// When the number of columns is odd the last iteration covers a single pixel:
+// the second pixel of the pair is replaced by zero, so nothing outside
+// [h_start, h_end) is read or added to any of the sums.
+static INLINE void acc_stat_win7_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = 7;
+  int j, k, l;
+  for (j = h_start; j < h_end; j += 2) {
+    // False only for the final pixel of an odd-length row.
+    const int has_pair = (j + 1 < h_end);
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = has_pair ? src[j + 1] : 0;
+    *sumX += X1 + X2;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = has_pair ? dgd_ijk[l + 1] : 0;
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // kl holds (D1, D2) in every pair of 16-bit lanes. It is built from
+        // the two bytes already loaded instead of a type-punned 16-bit load.
+        // With D2 == 0 the multiply-add in acc_stat_sse41() contributes
+        // nothing for the absent second pixel.
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+// Computes the M and H statistics for a 7x7 window over the rectangle
+// [h_start, h_end) x [v_start, v_end).
+//
+// Rows are accumulated into 32-bit integers and flushed into 64-bit totals
+// every 64 rows. The totals are then converted to sums of products of
+// mean-removed values using the average returned by find_average():
+//   M[l * 7 + k]             = sum (D_kl - avg) * (X - avg)
+//   H[(l * 7 + k) * 49 + ..] = sum (D_kl - avg) * (D_nm - avg)
+// where X is a source pixel and D_kl the degraded pixel at window position
+// (k, l).
+static INLINE void compute_stats_win7_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;
+  const int wiener_win = WIENER_WIN;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win7_one_line_sse4_1(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    // Move the 32-bit partial sums of this batch of rows into the 64-bit
+    // totals and restart the 32-bit accumulators from zero.
+    for (k = 0; k < wiener_win; ++k) {
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN2; ++k) {
+      for (l = 0; l < WIENER_WIN * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          // Each window row occupies 8 slots in H_int64; only the first
+          // wiener_win of them are read here.
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+// Accumulates the 5x5 (chroma) window statistics for one row of pixels.
+//
+// Same contract as acc_stat_win7_one_line_sse4_1() with a
+// WIENER_WIN_CHROMA x WIENER_WIN_CHROMA window. An odd number of columns is
+// handled by treating the missing second pixel of the last pair as zero, so
+// nothing outside [h_start, h_end) is read or added to any of the sums.
+static INLINE void acc_stat_win5_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  int j, k, l;
+  for (j = h_start; j < h_end; j += 2) {
+    // False only for the final pixel of an odd-length row.
+    const int has_pair = (j + 1 < h_end);
+    const uint8_t *dgd_ij = dgd + j;
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = has_pair ? src[j + 1] : 0;
+    *sumX += X1 + X2;
+    for (k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = has_pair ? dgd_ijk[l + 1] : 0;
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // kl holds (D1, D2) in every pair of 16-bit lanes. It is built from
+        // the two bytes already loaded instead of a type-punned 16-bit load.
+        // With D2 == 0 the multiply-add in acc_stat_sse41() contributes
+        // nothing for the absent second pixel.
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16((short)(D1 | (D2 << 8))));
+        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+      }
+    }
+  }
+}
+
+// Computes the M and H statistics for a 5x5 (chroma) window over the
+// rectangle [h_start, h_end) x [v_start, v_end).
+//
+// Same procedure as compute_stats_win7_opt_sse4_1(): 32-bit row sums are
+// flushed into 64-bit totals every 64 rows, then converted to sums of
+// products of mean-removed values using the average from find_average().
+static INLINE void compute_stats_win5_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l, m, n;
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (j = v_start; j < v_end; j += 64) {
+    const int vert_end = AOMMIN(64, v_end - j) + j;
+    for (i = j; i < vert_end; i++) {
+      acc_stat_win5_one_line_sse4_1(
+          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    // Move the 32-bit partial sums of this batch of rows into the 64-bit
+    // totals and restart the 32-bit accumulators from zero.
+    for (k = 0; k < wiener_win; ++k) {
+      for (l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+      double *H_ = H + idx0 * wiener_win2;
+      int64_t *H_int_ = &H_int64[idx0][0];
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          // Each window row occupies 8 slots in H_int64; only the first
+          // wiener_win of them are read here.
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+// SSE4.1 entry point for the Wiener statistics.
+//
+// Uses the specialised paths for wiener_win == WIENER_WIN and
+// wiener_win == WIENER_WIN_CHROMA and falls back to av1_compute_stats_c() for
+// any other window size.
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+                              const uint8_t *src, int h_start, int h_end,
+                              int v_start, int v_end, int dgd_stride,
+                              int src_stride, double *M, double *H) {
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H);
+  } else if (wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H);
+  } else {
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M, H);
+  }
+}
+
+// Returns a vector whose four 32-bit lanes each hold a in the low 16 bits and
+// b in the high 16 bits, i.e. the 16-bit lanes alternate a, b, a, b, ...
+static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+  return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+// Returns the sum of squared differences between src8 and the projection of
+// dat8 with the filter outputs flt0 / flt1 weighted by xq[0] / xq[1].
+//
+// Which filter outputs take part is selected by params->r[0] and
+// params->r[1]; when both are zero the result is the plain sum of squared
+// differences between dat8 and src8. Each row is processed in vector chunks,
+// with a scalar loop for the remaining pixels. The per-row 32-bit vector sums
+// are widened to 64 bits before being added to the running total.
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+  __m128i sum64 = _mm_setzero_si128();
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt0_16b =
+            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i flt1_16b =
+            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+        const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+        const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+        const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+        const __m128i v0 = _mm_madd_epi16(
+            xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m128i v1 = _mm_madd_epi16(
+            xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  } else if (params->r[0] > 0) {
+    // Multiply instead of left-shifting so the expression stays well defined
+    // if xq[0] is negative.
+    __m128i xq_coeff =
+        pair_set_epi16(xq[0], -xq[0] * (1 << SGRPROJ_RST_BITS));
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt0_16b =
+            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i v0 =
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+        const __m128i v1 =
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  } else if (params->r[1] > 0) {
+    // Multiply instead of left-shifting so the expression stays well defined
+    // if xq[1] is negative.
+    __m128i xq_coeff =
+        pair_set_epi16(xq[1], -xq[1] * (1 << SGRPROJ_RST_BITS));
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt1_16b =
+            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+        const __m128i v0 =
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
+        const __m128i v1 =
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt1 += flt1_stride;
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  } else {
+    for (i = 0; i < height; ++i) {
+      // Widen once per row, as the other branches do, so the 32-bit lanes
+      // only ever hold the squared errors of a single row.
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j < width - 16; j += 16) {
+        const __m128i d = xx_loadu_128(dat + j);
+        const __m128i s = xx_loadu_128(src + j);
+        const __m128i d0 = _mm_cvtepu8_epi16(d);
+        const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+        const __m128i s0 = _mm_cvtepu8_epi16(s);
+        const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+        const __m128i diff0 = _mm_sub_epi16(d0, s0);
+        const __m128i diff1 = _mm_sub_epi16(d1, s1);
+        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+        sum32 = _mm_add_epi32(sum32, err0);
+        sum32 = _mm_add_epi32(sum32, err1);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  }
+  int64_t sum[2];
+  xx_storeu_128(sum, sum64);
+  err += sum[0] + sum[1];
+  return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c index f776e84c7..2a792f14e 100644 --- a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -14,7 +14,7 @@ #include #include "aom_dsp/x86/synonyms.h" - +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom/aom_integer.h" #include "av1/common/reconinter.h" @@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, uint64_t csse; const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); - const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff); + const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff); __m256i v_acc0_q = _mm256_setzero_si256(); diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com new file mode 100644 index 000000000..5c8e0e09d --- /dev/null +++ b/third_party/aom/av1/exports_com @@ -0,0 +1,2 @@ +text aom_read_obu_header_and_size +text av1_resize_frame420 diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec index 05860e8c0..daabf6766 100644 --- a/third_party/aom/av1/exports_dec +++ b/third_party/aom/av1/exports_dec @@ -1,2 +1,3 @@ data aom_codec_av1_dx_algo text 
aom_codec_av1_dx +text av1_add_film_grain diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test new file mode 100644 index 000000000..dab377575 --- /dev/null +++ b/third_party/aom/av1/exports_test @@ -0,0 +1,2 @@ +text av1_get_fwd_txfm_cfg +text av1_rtcd -- cgit v1.2.3