Diffstat (limited to 'third_party/aom/av1')
-rw-r--r--  third_party/aom/av1/av1.cmake | 14
-rw-r--r--  third_party/aom/av1/av1_cx_iface.c | 160
-rw-r--r--  third_party/aom/av1/av1_dx_iface.c | 62
-rw-r--r--  third_party/aom/av1/av1_iface_common.h | 7
-rw-r--r--  third_party/aom/av1/common/alloccommon.h | 6
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447
-rw-r--r--  third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 8
-rw-r--r--  third_party/aom/av1/common/arm/blend_a64_hmask_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/blend_a64_vmask_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/cfl_neon.c | 4
-rw-r--r--  third_party/aom/av1/common/arm/convolve_neon.c | 363
-rw-r--r--  third_party/aom/av1/common/arm/convolve_neon.h | 6
-rw-r--r--  third_party/aom/av1/common/arm/jnt_convolve_neon.c | 512
-rw-r--r--  third_party/aom/av1/common/arm/mem_neon.h | 15
-rw-r--r--  third_party/aom/av1/common/arm/selfguided_neon.c | 18
-rw-r--r--  third_party/aom/av1/common/arm/transpose_neon.h | 83
-rw-r--r--  third_party/aom/av1/common/arm/warp_plane_neon.c | 714
-rw-r--r--  third_party/aom/av1/common/arm/wiener_convolve_neon.c | 145
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d.c | 140
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d.h | 6
-rw-r--r--  third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 6
-rw-r--r--  third_party/aom/av1/common/av1_loopfilter.c | 945
-rw-r--r--  third_party/aom/av1/common/av1_loopfilter.h | 120
-rwxr-xr-x  third_party/aom/av1/common/av1_rtcd_defs.pl | 46
-rw-r--r--  third_party/aom/av1/common/av1_txfm.c | 50
-rw-r--r--  third_party/aom/av1/common/av1_txfm.h | 32
-rw-r--r--  third_party/aom/av1/common/blockd.c | 64
-rw-r--r--  third_party/aom/av1/common/blockd.h | 53
-rw-r--r--  third_party/aom/av1/common/cdef.h | 6
-rw-r--r--  third_party/aom/av1/common/cdef_block.h | 6
-rw-r--r--  third_party/aom/av1/common/cdef_block_simd.h | 5
-rw-r--r--  third_party/aom/av1/common/cfl.h | 6
-rw-r--r--  third_party/aom/av1/common/common.h | 6
-rw-r--r--  third_party/aom/av1/common/common_data.h | 75
-rw-r--r--  third_party/aom/av1/common/convolve.c | 116
-rw-r--r--  third_party/aom/av1/common/convolve.h | 21
-rw-r--r--  third_party/aom/av1/common/entropy.h | 6
-rw-r--r--  third_party/aom/av1/common/entropymode.h | 8
-rw-r--r--  third_party/aom/av1/common/entropymv.c | 55
-rw-r--r--  third_party/aom/av1/common/entropymv.h | 16
-rw-r--r--  third_party/aom/av1/common/enums.h | 12
-rw-r--r--  third_party/aom/av1/common/filter.h | 22
-rw-r--r--  third_party/aom/av1/common/frame_buffers.c | 11
-rw-r--r--  third_party/aom/av1/common/frame_buffers.h | 12
-rw-r--r--  third_party/aom/av1/common/idct.c | 274
-rw-r--r--  third_party/aom/av1/common/idct.h | 31
-rw-r--r--  third_party/aom/av1/common/mv.h | 8
-rw-r--r--  third_party/aom/av1/common/mvref_common.c | 381
-rw-r--r--  third_party/aom/av1/common/mvref_common.h | 43
-rw-r--r--  third_party/aom/av1/common/obmc.h | 14
-rw-r--r--  third_party/aom/av1/common/obu_util.c | 147
-rw-r--r--  third_party/aom/av1/common/obu_util.h | 47
-rw-r--r--  third_party/aom/av1/common/odintrin.h | 12
-rw-r--r--  third_party/aom/av1/common/onyxc_int.h | 29
-rw-r--r--  third_party/aom/av1/common/ppc/cfl_ppc.c | 85
-rw-r--r--  third_party/aom/av1/common/pred_common.c | 4
-rw-r--r--  third_party/aom/av1/common/pred_common.h | 6
-rw-r--r--  third_party/aom/av1/common/quant_common.h | 6
-rw-r--r--  third_party/aom/av1/common/reconinter.c | 652
-rw-r--r--  third_party/aom/av1/common/reconinter.h | 148
-rw-r--r--  third_party/aom/av1/common/reconintra.h | 6
-rw-r--r--  third_party/aom/av1/common/resize.c | 39
-rw-r--r--  third_party/aom/av1/common/resize.h | 6
-rw-r--r--  third_party/aom/av1/common/restoration.c | 131
-rw-r--r--  third_party/aom/av1/common/restoration.h | 7
-rw-r--r--  third_party/aom/av1/common/scale.h | 7
-rw-r--r--  third_party/aom/av1/common/scan.h | 6
-rw-r--r--  third_party/aom/av1/common/seg_common.h | 6
-rw-r--r--  third_party/aom/av1/common/thread_common.c | 18
-rw-r--r--  third_party/aom/av1/common/thread_common.h | 6
-rw-r--r--  third_party/aom/av1/common/tile_common.c | 16
-rw-r--r--  third_party/aom/av1/common/tile_common.h | 9
-rw-r--r--  third_party/aom/av1/common/timing.h | 6
-rw-r--r--  third_party/aom/av1/common/token_cdfs.h | 5
-rw-r--r--  third_party/aom/av1/common/txb_common.h | 243
-rw-r--r--  third_party/aom/av1/common/warped_motion.c | 4
-rw-r--r--  third_party/aom/av1/common/warped_motion.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h | 10
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse2.h | 6
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm_sse4.h | 11
-rw-r--r--  third_party/aom/av1/common/x86/cfl_simd.h | 5
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_avx2.c | 2
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_sse2.c | 3
-rw-r--r--  third_party/aom/av1/common/x86/convolve_sse2.c | 11
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c | 1117
-rw-r--r--  third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c | 3935
-rw-r--r--  third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 1
-rw-r--r--  third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h | 28
-rw-r--r--  third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c | 268
-rw-r--r--  third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 211
-rw-r--r--  third_party/aom/av1/common/x86/reconinter_avx2.c | 496
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_avx2.c | 23
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c | 24
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_sse4.c | 809
-rw-r--r--  third_party/aom/av1/common/x86/wiener_convolve_avx2.c | 3
-rw-r--r--  third_party/aom/av1/common/x86/wiener_convolve_sse2.c | 3
-rw-r--r--  third_party/aom/av1/decoder/accounting.h | 6
-rw-r--r--  third_party/aom/av1/decoder/decodeframe.c | 555
-rw-r--r--  third_party/aom/av1/decoder/decodeframe.h | 8
-rw-r--r--  third_party/aom/av1/decoder/decodemv.c | 179
-rw-r--r--  third_party/aom/av1/decoder/decodemv.h | 6
-rw-r--r--  third_party/aom/av1/decoder/decoder.c | 21
-rw-r--r--  third_party/aom/av1/decoder/decoder.h | 12
-rw-r--r--  third_party/aom/av1/decoder/decodetxb.h | 6
-rw-r--r--  third_party/aom/av1/decoder/detokenize.h | 6
-rw-r--r--  third_party/aom/av1/decoder/dthread.h | 6
-rw-r--r--  third_party/aom/av1/decoder/inspection.h | 6
-rw-r--r--  third_party/aom/av1/decoder/obu.c | 176
-rw-r--r--  third_party/aom/av1/decoder/obu.h | 29
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.c | 7
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.h | 6
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.c | 8
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.h | 6
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.c | 179
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.h | 10
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 147
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 6
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 6
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.c | 68
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.h | 6
-rw-r--r--  third_party/aom/av1/encoder/bitstream.c | 312
-rw-r--r--  third_party/aom/av1/encoder/bitstream.h | 16
-rw-r--r--  third_party/aom/av1/encoder/block.h | 53
-rw-r--r--  third_party/aom/av1/encoder/blockiness.c | 1
-rw-r--r--  third_party/aom/av1/encoder/context_tree.c | 17
-rw-r--r--  third_party/aom/av1/encoder/context_tree.h | 8
-rw-r--r--  third_party/aom/av1/encoder/corner_detect.h | 6
-rw-r--r--  third_party/aom/av1/encoder/corner_match.h | 6
-rw-r--r--  third_party/aom/av1/encoder/cost.h | 6
-rw-r--r--  third_party/aom/av1/encoder/dwt.h | 5
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.c | 1075
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.h | 11
-rw-r--r--  third_party/aom/av1/encoder/encodemb.c | 71
-rw-r--r--  third_party/aom/av1/encoder/encodemb.h | 17
-rw-r--r--  third_party/aom/av1/encoder/encodemv.c | 26
-rw-r--r--  third_party/aom/av1/encoder/encodemv.h | 14
-rw-r--r--  third_party/aom/av1/encoder/encoder.c | 490
-rw-r--r--  third_party/aom/av1/encoder/encoder.h | 105
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.c | 48
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ethread.c | 252
-rw-r--r--  third_party/aom/av1/encoder/ethread.h | 6
-rw-r--r--  third_party/aom/av1/encoder/extend.h | 6
-rw-r--r--  third_party/aom/av1/encoder/firstpass.c | 992
-rw-r--r--  third_party/aom/av1/encoder/firstpass.h | 19
-rw-r--r--  third_party/aom/av1/encoder/global_motion.c | 4
-rw-r--r--  third_party/aom/av1/encoder/global_motion.h | 6
-rw-r--r--  third_party/aom/av1/encoder/grain_test_vectors.h | 6
-rw-r--r--  third_party/aom/av1/encoder/hash.h | 8
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.c | 94
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.h | 16
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 38
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 6
-rw-r--r--  third_party/aom/av1/encoder/lookahead.h | 6
-rw-r--r--  third_party/aom/av1/encoder/mathutils.h | 7
-rw-r--r--  third_party/aom/av1/encoder/mbgraph.c | 7
-rw-r--r--  third_party/aom/av1/encoder/mbgraph.h | 6
-rw-r--r--  third_party/aom/av1/encoder/mcomp.c | 231
-rw-r--r--  third_party/aom/av1/encoder/mcomp.h | 33
-rw-r--r--  third_party/aom/av1/encoder/ml.c | 16
-rw-r--r--  third_party/aom/av1/encoder/ml.h | 11
-rw-r--r--  third_party/aom/av1/encoder/palette.h | 6
-rw-r--r--  third_party/aom/av1/encoder/partition_model_weights.h | 1457
-rw-r--r--  third_party/aom/av1/encoder/picklpf.c | 3
-rw-r--r--  third_party/aom/av1/encoder/picklpf.h | 6
-rw-r--r--  third_party/aom/av1/encoder/pickrst.c | 128
-rw-r--r--  third_party/aom/av1/encoder/pickrst.h | 23
-rw-r--r--  third_party/aom/av1/encoder/pustats.h | 183
-rw-r--r--  third_party/aom/av1/encoder/random.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ransac.h | 6
-rw-r--r--  third_party/aom/av1/encoder/rate_distortion_model_params.h | 6
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.c | 96
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.h | 29
-rw-r--r--  third_party/aom/av1/encoder/ratectrl_xiph.c | 0
-rw-r--r--  third_party/aom/av1/encoder/ratectrl_xiph.h | 0
-rw-r--r--  third_party/aom/av1/encoder/rd.c | 494
-rw-r--r--  third_party/aom/av1/encoder/rd.h | 48
-rw-r--r--  third_party/aom/av1/encoder/rdopt.c | 4551
-rw-r--r--  third_party/aom/av1/encoder/rdopt.h | 18
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.c | 627
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.h | 127
-rw-r--r--  third_party/aom/av1/encoder/segmentation.h | 6
-rw-r--r--  third_party/aom/av1/encoder/speed_features.c | 145
-rw-r--r--  third_party/aom/av1/encoder/speed_features.h | 144
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.c | 81
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.h | 6
-rw-r--r--  third_party/aom/av1/encoder/tokenize.h | 6
-rw-r--r--  third_party/aom/av1/encoder/tx_prune_model_weights.h | 1482
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 259
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 75
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 6
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 6
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 9
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_avx2.c | 130
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 46
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 344
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_avx2.c | 403
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_sse4.c | 389
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 4
-rw-r--r--  third_party/aom/av1/exports_com | 2
-rw-r--r--  third_party/aom/av1/exports_dec | 1
-rw-r--r--  third_party/aom/av1/exports_test | 2
209 files changed, 22699 insertions, 10002 deletions
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 4c4f542fe..3a7cd7ee1 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -53,6 +53,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/mv.h"
"${AOM_ROOT}/av1/common/mvref_common.c"
"${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/obu_util.c"
+ "${AOM_ROOT}/av1/common/obu_util.h"
"${AOM_ROOT}/av1/common/odintrin.c"
"${AOM_ROOT}/av1/common/odintrin.h"
"${AOM_ROOT}/av1/common/onyxc_int.h"
@@ -78,8 +80,8 @@ list(APPEND AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/thread_common.h"
"${AOM_ROOT}/av1/common/tile_common.c"
"${AOM_ROOT}/av1/common/tile_common.h"
- "${AOM_ROOT}/av1/common/timing.h"
"${AOM_ROOT}/av1/common/timing.c"
+ "${AOM_ROOT}/av1/common/timing.h"
"${AOM_ROOT}/av1/common/token_cdfs.h"
"${AOM_ROOT}/av1/common/txb_common.c"
"${AOM_ROOT}/av1/common/txb_common.h"
@@ -176,6 +178,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/rd.h"
"${AOM_ROOT}/av1/encoder/rdopt.c"
"${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
"${AOM_ROOT}/av1/encoder/segmentation.c"
"${AOM_ROOT}/av1/encoder/segmentation.h"
"${AOM_ROOT}/av1/encoder/speed_features.c"
@@ -268,7 +272,8 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
- "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c")
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
@@ -276,7 +281,9 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
- "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c")
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
"${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
@@ -301,6 +308,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
"${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
"${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
"${AOM_ROOT}/av1/common/cdef_block_neon.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 3bc4804c9..3295f618a 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -14,28 +14,29 @@
#include "config/aom_config.h"
#include "config/aom_version.h"
-#include "aom/aom_encoder.h"
#include "aom_ports/aom_once.h"
+#include "aom_ports/mem_ops.h"
#include "aom_ports/system_state.h"
+
+#include "aom/aom_encoder.h"
#include "aom/internal/aom_codec_internal.h"
-#include "av1/encoder/encoder.h"
-#include "aom/aomcx.h"
-#include "av1/encoder/firstpass.h"
+
#include "av1/av1_iface_common.h"
#include "av1/encoder/bitstream.h"
-#include "aom_ports/mem_ops.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
#define MAG_SIZE (4)
#define MAX_NUM_ENHANCEMENT_LAYERS 3
struct av1_extracfg {
int cpu_used; // available cpu percentage in 1/16
- int dev_sf;
unsigned int enable_auto_alt_ref;
unsigned int enable_auto_bwd_ref;
unsigned int noise_sensitivity;
unsigned int sharpness;
unsigned int static_thresh;
+ unsigned int row_mt;
unsigned int tile_columns; // log2 number of tile columns
unsigned int tile_rows; // log2 number of tile rows
unsigned int arnr_max_frames;
@@ -98,37 +99,40 @@ struct av1_extracfg {
float noise_level;
int noise_block_size;
#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
};
static struct av1_extracfg default_extra_cfg = {
- 0, // cpu_used
- 0, // dev_sf
- 1, // enable_auto_alt_ref
- 0, // enable_auto_bwd_ref
- 0, // noise_sensitivity
- 0, // sharpness
- 0, // static_thresh
- 0, // tile_columns
- 0, // tile_rows
- 7, // arnr_max_frames
- 5, // arnr_strength
- 0, // min_gf_interval; 0 -> default decision
- 0, // max_gf_interval; 0 -> default decision
- AOM_TUNE_PSNR, // tuning
- 10, // cq_level
- 0, // rc_max_intra_bitrate_pct
- 0, // rc_max_inter_bitrate_pct
- 0, // gf_cbr_boost_pct
- 0, // lossless
- 1, // enable_cdef
- 1, // enable_restoration
- 0, // disable_trellis_quant
- 0, // enable_qm
- DEFAULT_QM_Y, // qm_y
- DEFAULT_QM_U, // qm_u
- DEFAULT_QM_V, // qm_v
- DEFAULT_QM_FIRST, // qm_min
- DEFAULT_QM_LAST, // qm_max
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // enable_auto_bwd_ref
+ 0, // noise_sensitivity
+ CONFIG_SHARP_SETTINGS, // sharpness
+ 0, // static_thresh
+ 0, // row_mt
+ 0, // tile_columns
+ 0, // tile_rows
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ AOM_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+ !CONFIG_SHARP_SETTINGS, // enable_cdef
+ 1, // enable_restoration
+ 0, // disable_trellis_quant
+ 0, // enable_qm
+ DEFAULT_QM_Y, // qm_y
+ DEFAULT_QM_U, // qm_u
+ DEFAULT_QM_V, // qm_v
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
#if CONFIG_DIST_8X8
0,
#endif
@@ -150,7 +154,7 @@ static struct av1_extracfg default_extra_cfg = {
0, // render width
0, // render height
AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
- 0, // Single tile decoding is off by default.
+ 1, // this depends on large_scale_tile.
0, // error_resilient_mode off by default.
0, // s_frame_mode off by default.
0, // film_grain_test_vector
@@ -168,6 +172,8 @@ static struct av1_extracfg default_extra_cfg = {
0, // noise_level
32, // noise_block_size
#endif
+ 0, // chroma_subsampling_x
+ 0, // chroma_subsampling_y
};
struct aom_codec_alg_priv {
@@ -251,10 +257,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
if (extra_cfg->max_gf_interval > 0) {
- RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
- }
- if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
- RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+ RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
(MAX_LAG_BUFFERS - 1));
}
@@ -284,13 +287,14 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
- RANGE_CHECK(extra_cfg, dev_sf, 0, UINT8_MAX);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
AOM_SUPERBLOCK_SIZE_DYNAMIC);
RANGE_CHECK_HI(cfg, large_scale_tile, 1);
RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
+ RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+
RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
@@ -372,6 +376,9 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
#endif
}
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
return AOM_CODEC_OK;
}
@@ -581,7 +588,6 @@ static aom_codec_err_t set_encoder_config(
oxcf->sframe_mode = cfg->sframe_mode;
oxcf->sframe_enabled = cfg->sframe_dist != 0;
oxcf->speed = extra_cfg->cpu_used;
- oxcf->dev_sf = extra_cfg->dev_sf;
oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
@@ -637,6 +643,8 @@ static aom_codec_err_t set_encoder_config(
oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
}
+ oxcf->row_mt = extra_cfg->row_mt;
+
oxcf->tile_columns = extra_cfg->tile_columns;
oxcf->tile_rows = extra_cfg->tile_rows;
@@ -692,6 +700,24 @@ static aom_codec_err_t set_encoder_config(
oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+#if CONFIG_REDUCED_ENCODER_BORDER
+ if (oxcf->superres_mode != SUPERRES_NONE ||
+ oxcf->resize_mode != RESIZE_NONE) {
+ warn(
+ "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
+ "Disabling superres/resize.\n");
+ // return AOM_CODEC_INVALID_PARAM;
+ disable_superres(oxcf);
+ oxcf->resize_mode = RESIZE_NONE;
+ oxcf->resize_scale_denominator = SCALE_NUMERATOR;
+ oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
+ }
+#endif // CONFIG_REDUCED_ENCODER_BORDER
+
+ oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+ oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+
return AOM_CODEC_OK;
}
@@ -731,6 +757,10 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
return res;
}
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+ return av1_get_global_headers(ctx->cpi);
+}
+
static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
va_list args) {
int *const arg = va_arg(args, int *);
@@ -765,12 +795,6 @@ static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
-static aom_codec_err_t ctrl_set_devsf(aom_codec_alg_priv_t *ctx, va_list args) {
- struct av1_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.dev_sf = CAST(AOME_SET_DEVSF, args);
- return update_extra_cfg(ctx, &extra_cfg);
-}
-
static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -806,6 +830,13 @@ static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1669,6 +1700,20 @@ static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1_COPY_REFERENCE, ctrl_copy_reference },
{ AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1681,11 +1726,11 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AOME_SET_SCALEMODE, ctrl_set_scale_mode },
{ AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
{ AOME_SET_CPUUSED, ctrl_set_cpuused },
- { AOME_SET_DEVSF, ctrl_set_devsf },
{ AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
{ AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
{ AOME_SET_SHARPNESS, ctrl_set_sharpness },
{ AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_ROW_MT, ctrl_set_row_mt },
{ AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
{ AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
{ AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
@@ -1754,7 +1799,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
{ AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
{ AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
-
+ { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+ { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
{ -1, NULL },
};
@@ -1850,13 +1896,13 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
},
{
// NOLINT
- 1, // 1 cfg map
- encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
- encoder_encode, // aom_codec_encode_fn_t
- encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
- encoder_set_config, // aom_codec_enc_config_set_fn_t
- NULL, // aom_codec_get_global_headers_fn_t
- encoder_get_preview, // aom_codec_get_preview_frame_fn_t
- NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ 1, // 1 cfg map
+ encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ encoder_get_global_headers, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
}
};
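The new AV1E_SET_ROW_MT and AV1E_SET_CHROMA_SUBSAMPLING_X/Y entries above follow libaom's usual control pattern: each ctrl_set_* handler copies its value into av1_extracfg, and validate_config() range-checks it on the next configuration update. A minimal caller-side sketch, assuming an encoder context already initialized with aom_codec_enc_init() (editor's illustration, not code from this patch):

#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

/* Sketch: exercise the controls this patch wires up. Assumes `codec` was
 * initialized with aom_codec_enc_init() against aom_codec_av1_cx(). */
static void apply_new_controls(aom_codec_ctx_t *codec) {
  /* Enable row-based multithreading within a tile (range-checked to 0..1). */
  aom_codec_control(codec, AV1E_SET_ROW_MT, 1);
  /* Signal chroma subsampling for the sequence header; validate_config()
   * range-checks both values to 0..1. */
  aom_codec_control(codec, AV1E_SET_CHROMA_SUBSAMPLING_X, 1);
  aom_codec_control(codec, AV1E_SET_CHROMA_SUBSAMPLING_Y, 1);
}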
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
index f42572019..4a6631047 100644
--- a/third_party/aom/av1/av1_dx_iface.c
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -26,6 +26,7 @@
#include "av1/common/alloccommon.h"
#include "av1/common/frame_buffers.h"
#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
#include "av1/decoder/decoder.h"
#include "av1/decoder/decodeframe.h"
@@ -46,6 +47,7 @@ struct aom_codec_alg_priv {
int last_show_frame; // Index of last output frame.
int byte_alignment;
int skip_loop_filter;
+ int skip_film_grain;
int decode_tile_row;
int decode_tile_col;
unsigned int tile_mode;
@@ -103,6 +105,15 @@ static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
priv->cfg.cfg.ext_partition = 1;
}
av1_zero(priv->image_with_grain);
+ // Turn row_mt on by default.
+ priv->row_mt = 1;
+
+ // Turn on normal tile coding mode by default.
+ // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+ // mode (refer to the lightfield example).
+ priv->tile_mode = 0;
+ priv->decode_tile_row = -1;
+ priv->decode_tile_col = -1;
}
return AOM_CODEC_OK;
@@ -216,7 +227,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
while (1) {
data += bytes_read;
data_sz -= bytes_read;
- const uint8_t *payload_start = data;
+ if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
// Check that the selected OBU is a sequence header
if (obu_header.type == OBU_SEQUENCE_HEADER) {
// Sanity check on sequence header size
@@ -264,9 +275,9 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
}
}
// skip past any unread OBU header data
- data = payload_start + payload_size;
+ data += payload_size;
data_sz -= payload_size;
- if (data_sz <= 0) break; // exit if we're out of OBUs
+ if (data_sz == 0) break; // exit if we're out of OBUs
status = aom_read_obu_header_and_size(
data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
if (status != AOM_CODEC_OK) return status;
@@ -313,6 +324,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
cm->new_fb_idx = INVALID_IDX;
cm->byte_alignment = ctx->byte_alignment;
cm->skip_loop_filter = ctx->skip_loop_filter;
+ cm->skip_film_grain = ctx->skip_film_grain;
if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
pool->get_fb_cb = ctx->get_ext_fb_cb;
@@ -434,7 +446,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
frame_worker_data->pbi->row_mt = ctx->row_mt;
- worker->hook = (AVxWorkerHook)frame_worker_hook;
+ worker->hook = frame_worker_hook;
if (!winterface->reset(worker)) {
set_error_detail(ctx, "Frame Worker thread creation failed");
return AOM_CODEC_MEM_ERROR;
@@ -515,12 +527,11 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
const uint8_t *data, size_t data_sz,
void *user_priv) {
- const uint8_t *data_start = data;
- const uint8_t *data_end = data + data_sz;
aom_codec_err_t res = AOM_CODEC_OK;
- // Release any pending output frames from the previous decoder call.
- // We need to do this even if the decoder is being flushed
+ // Release any pending output frames from the previous decoder_decode call.
+ // We need to do this even if the decoder is being flushed or the input
+ // arguments are invalid.
if (ctx->frame_workers) {
BufferPool *const pool = ctx->buffer_pool;
RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -538,10 +549,13 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
unlock_buffer_pool(ctx->buffer_pool);
}
+ /* Sanity checks */
+ /* NULL data ptr allowed if data_sz is 0 too */
if (data == NULL && data_sz == 0) {
ctx->flushed = 1;
return AOM_CODEC_OK;
}
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
@@ -552,6 +566,9 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
if (res != AOM_CODEC_OK) return res;
}
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
if (ctx->is_annexb) {
// read the size of this temporal unit
size_t length_of_size;
@@ -617,6 +634,7 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
img->fmt != grain_img_buf->fmt) {
aom_img_free(grain_img_buf);
grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
}
}
if (!grain_img_buf) {
@@ -624,7 +642,14 @@ static aom_image_t *add_grain_if_needed(aom_image_t *img,
*grain_img_ptr = grain_img_buf;
}
- av1_add_film_grain(grain_params, img, grain_img_buf);
+ if (grain_img_buf) {
+ grain_img_buf->user_priv = img->user_priv;
+ if (av1_add_film_grain(grain_params, img, grain_img_buf)) {
+ aom_img_free(grain_img_buf);
+ grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
+ }
+ }
return grain_img_buf;
}
@@ -720,8 +745,13 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
img = &ctx->img;
img->temporal_id = cm->temporal_layer_id;
img->spatial_id = cm->spatial_layer_id;
+ if (cm->skip_film_grain) grain_params->apply_grain = 0;
aom_image_t *res = add_grain_if_needed(
img, &ctx->image_with_grain[*index], grain_params);
+ if (!res) {
+ aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+ "Grain systhesis failed\n");
+ }
*index += 1; // Advance the iterator to point to the next image
return res;
}
@@ -1128,6 +1158,19 @@ static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
return AOM_CODEC_OK;
}
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->skip_film_grain = va_arg(args, int);
+
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.skip_film_grain = ctx->skip_film_grain;
+ }
+
+ return AOM_CODEC_OK;
+}
+
static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
va_list args) {
#if !CONFIG_ACCOUNTING
@@ -1231,6 +1274,7 @@ static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
{ AV1D_SET_ROW_MT, ctrl_set_row_mt },
{ AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+ { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
// Getters
{ AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
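The decoder gains a matching control: AV1D_SET_SKIP_FILM_GRAIN stores a flag in the context, propagates it to cm->skip_film_grain on any live frame worker, and decoder_get_frame() then clears grain_params->apply_grain before grain synthesis runs. A caller-side sketch, assuming an initialized decoder context (editor's illustration, not code from this patch):

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

/* Sketch: suppress film-grain synthesis on decoded frames. Assumes `codec`
 * was initialized with aom_codec_dec_init() against aom_codec_av1_dx(). */
static void disable_film_grain(aom_codec_ctx_t *codec) {
  aom_codec_control(codec, AV1D_SET_SKIP_FILM_GRAIN, 1);
}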
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
index c03892b73..4a7af580b 100644
--- a/third_party/aom/av1/av1_iface_common.h
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -8,10 +8,11 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_AV1_IFACE_COMMON_H_
-#define AV1_AV1_IFACE_COMMON_H_
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
void *user_priv) {
@@ -132,4 +133,4 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
return AOM_CODEC_OK;
}
-#endif // AV1_AV1_IFACE_COMMON_H_
+#endif // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
index dbcb5b947..8e5896981 100644
--- a/third_party/aom/av1/common/alloccommon.h
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ALLOCCOMMON_H_
-#define AV1_COMMON_ALLOCCOMMON_H_
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
#define INVALID_IDX -1 // Invalid buffer index.
@@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height);
} // extern "C"
#endif
-#endif // AV1_COMMON_ALLOCCOMMON_H_
+#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_
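The next file is the bulk of this update: av1_inv_txfm_neon.c grows full low-bitdepth inverse-transform paths built on btf_16_lane_* butterfly helpers. As a reading aid, here is a scalar sketch of the rotation each helper vectorizes (editor's illustration; INV_COS_BIT is the cosine-table precision, 12 in libaom, and the coefficient pairs come from cospi_arr()):

#include <stdint.h>

#define INV_COS_BIT 12 /* fixed-point precision of the cosine table */

/* Scalar equivalent of btf_16_lane_0_1_neon: a two-output rotation
 * (butterfly) with round-to-nearest fixed-point arithmetic. The NEON
 * version performs the same math on eight int16 lanes at once via
 * vmull/vmlal/vmlsl and a rounding narrowing shift (vrshrn_n_s32). */
static void btf_16_scalar(int32_t in0, int32_t in1, int32_t c0, int32_t c1,
                          int16_t *t0, int16_t *t1) {
  const int32_t rounding = 1 << (INV_COS_BIT - 1);
  *t0 = (int16_t)((in0 * c0 + in1 * c1 + rounding) >> INV_COS_BIT);
  *t1 = (int16_t)((in0 * c1 - in1 * c0 + rounding) >> INV_COS_BIT);
}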
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
index 51c991498..bad411743 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <arm_neon.h>
+
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
@@ -19,19 +21,7 @@
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#include "av1/common/arm/av1_inv_txfm_neon.h"
-
-static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) {
- const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
- TxSetType tx_set_type;
- if (tx_size_sqr_up > TX_32X32) {
- tx_set_type = EXT_TX_SET_DCTONLY;
- } else if (tx_size_sqr_up == TX_32X32) {
- tx_set_type = EXT_TX_SET_DCT_IDTX;
- } else {
- tx_set_type = EXT_TX_SET_ALL16;
- }
- return tx_set_type;
-}
+#include "av1/common/arm/transpose_neon.h"
// 1D itx types
typedef enum ATTRIBUTE_PACKED {
@@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
{ av1_idct64_new, NULL, NULL },
};
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ int16x8_t temp_output;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+ temp_output = vaddq_s16(temp_output, in[j]);
+ vst1_u8(output, vqmovun_s16(temp_output));
+ output += stride;
+ }
+}
+
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+ int16x8_t res0,
+ int16x8_t res1) {
+ int16x8_t temp_output[2];
+ uint8x16_t temp_output_8q;
+ temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+ temp_output[0] = vaddq_s16(temp_output[0], res0);
+ temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+ temp_output[1] = vaddq_s16(temp_output[1], res1);
+ temp_output_8q =
+ vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+ return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud, int height) {
+ uint8x16_t temp_output_8q;
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output_8q = vld1q_u8(output + i * stride);
+ temp_output_8q =
+ lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
+ vst1q_u8((output + i * stride), temp_output_8q);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+ int value) {
+ for (int i = 0; i < size; i++) {
+ a[i] = vdupq_n_s16((int16_t)value);
+ }
+}
+
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+ int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0_l, s0_h, s1_l, s1_h;
+ int16x4_t v0[2], v1[2];
+
+ s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
+ s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
+ s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
+ s1_h = vmull_n_s16(vget_high_s16(in0), coef2);
+
+ v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+ int32x4_t t0[2], t1[2];
+ int16x4_t v0[2], v1[2];
+
+ // Don't add/sub before multiply, which will overflow in iadst8.
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+ const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+ const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+ const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+ t0[0] = vaddq_s32(x0_lo, x1_lo);
+ t0[1] = vaddq_s32(x0_hi, x1_hi);
+ t1[0] = vsubq_s32(x0_lo, x1_lo);
+ t1[1] = vsubq_s32(x0_hi, x1_hi);
+
+ v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);
+
+ x[0] = vcombine_s16(v0[0], v0[1]);
+ x[1] = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+ int16_t *const c2,
+ int16_t *const c3) {
+ int16x4_t val = vdup_n_s16((int16_t)0);
+ val = vld1_lane_s16(c0, val, 0);
+ val = vld1_lane_s16(c1, val, 1);
+ val = vld1_lane_s16(c2, val, 2);
+ val = vld1_lane_s16(c3, val, 3);
+ return val;
+}
+
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // Stage 1
+ x[0] = in[7];
+ x[1] = in[0];
+ x[2] = in[5];
+ x[3] = in[2];
+ x[4] = in[3];
+ x[5] = in[4];
+ x[6] = in[1];
+ x[7] = in[6];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s4);
+ x[1] = vqaddq_s16(s1, s5);
+ x[2] = vqaddq_s16(s2, s6);
+ x[3] = vqaddq_s16(s3, s7);
+ x[4] = vqsubq_s16(s0, s4);
+ x[5] = vqsubq_s16(s1, s5);
+ x[6] = vqsubq_s16(s2, s6);
+ x[7] = vqsubq_s16(s3, s7);
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ s2 = x[2];
+ s3 = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+ // Stage 5
+ x[0] = vqaddq_s16(s0, s2);
+ x[1] = vqaddq_s16(s1, s3);
+ x[2] = vqsubq_s16(s0, s2);
+ x[3] = vqsubq_s16(s1, s3);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s4, s5;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+
+ btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[4] = s0;
+ x[5] = s1;
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+ // Stage 5
+ x[0] = s0;
+ x[1] = s1;
+ x[2] = s0;
+ x[3] = s1;
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[8], step2[8];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+ btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+ // stage 3
+ btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+ // stage 4
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+ // stage 5
+ out[0] = vqaddq_s16(step1[0], step2[7]);
+ out[1] = vqaddq_s16(step1[1], step1[6]);
+ out[2] = vqaddq_s16(step1[2], step1[5]);
+ out[3] = vqaddq_s16(step1[3], step2[4]);
+ out[4] = vqsubq_s16(step1[3], step2[4]);
+ out[5] = vqsubq_s16(step1[2], step1[5]);
+ out[6] = vqsubq_s16(step1[1], step1[6]);
+ out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
+
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 4
+ // stage 5
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+}
+
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+ assert(!(size % 4));
+ if (!bit) return;
+ const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
+ for (int i = 0; i < size; i++) {
+ arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
+ }
+}
+
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+ int16x8_t temp[8];
+ for (int i = 0; i < size; ++i) {
+ temp[i] = input[size - 1 - i];
+ }
+ for (int i = 0; i < size; ++i) {
+ input[i] = temp[i];
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+ int16x8_t *const a,
+ int out_size) {
+ for (int i = 0; i < 8; ++i) {
+ a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
+ vmovn_s32(vld1q_s32(input + 4)));
+ input += out_size;
+ }
+}
+
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ output[0] = vmulq_n_s16(input[0], (int16_t)2);
+ output[1] = vmulq_n_s16(input[1], (int16_t)2);
+ output[2] = vmulq_n_s16(input[2], (int16_t)2);
+ output[3] = vmulq_n_s16(input[3], (int16_t)2);
+ output[4] = vmulq_n_s16(input[4], (int16_t)2);
+ output[5] = vmulq_n_s16(input[5], (int16_t)2);
+ output[6] = vmulq_n_s16(input[6], (int16_t)2);
+ output[7] = vmulq_n_s16(input[7], (int16_t)2);
+}
+
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+ int size) {
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+
+ for (int z = 0; z < size; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+ int16_t scale = (int16_t)(2 * NewSqrt2);
+
+ for (int z = 0; z < 16; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), scale);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ for (int z = 0; z < 32; ++z) {
+ output[z] = vmulq_n_s16(input[z], (int16_t)4);
+ }
+}
+
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 4
+
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+}
+
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+ btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+ btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+ step2[0] = in[0];
+ step2[1] = in[8];
+ step2[2] = in[4];
+ step2[3] = in[12];
+ step2[4] = in[2];
+ step2[5] = in[10];
+ step2[6] = in[6];
+ step2[7] = in[14];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[2] = in[4];
+ step2[4] = in[2];
+ step2[6] = in[6];
+
+ btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+ // stage 3
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
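+// Full 16-point inverse ADST, following the nine-stage flow graph of the
+// scalar iadst16. Each btf_16_lane_*_neon() computes one butterfly pair from
+// the indicated lanes of the packed cosine constants c0..c4 and c.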
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[0] = in[15];
+ x[1] = in[0];
+ x[2] = in[13];
+ x[3] = in[2];
+ x[4] = in[11];
+ x[5] = in[4];
+ x[6] = in[9];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[9] = in[8];
+ x[10] = in[5];
+ x[11] = in[10];
+ x[12] = in[3];
+ x[13] = in[12];
+ x[14] = in[1];
+ x[15] = in[14];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+ btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+ btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+ btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+  // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
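+// 16-point inverse ADST for the eob == 1 case: only in[0] is non-zero, so
+// every add of an implicitly zero term becomes a plain copy between stages.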
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[10];
+ int16x8_t s0, s1, s4, s5;
+ int16x8_t s8, s9, s12, s13;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[8] = s0;
+ x[9] = s1;
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+ // Stage 5
+ x[0] = t[0];
+ x[1] = t[1];
+ x[4] = t[0];
+ x[5] = t[1];
+ x[8] = s8;
+ x[9] = s9;
+ x[12] = s8;
+ x[13] = s9;
+
+  // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ t[8] = x[8];
+ t[9] = x[9];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+ // Stage 7
+ x[0] = t[0];
+ x[1] = t[1];
+ x[2] = t[0];
+ x[3] = t[1];
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+ x[8] = t[8];
+ x[9] = t[9];
+ x[10] = t[8];
+ x[11] = t[9];
+ x[12] = s12;
+ x[13] = s13;
+ x[14] = s12;
+ x[15] = s13;
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
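+// 16-point inverse ADST with at most the first 8 input vectors non-zero. The
+// stage-2 butterflies take a single non-zero input each (btf_16_neon); from
+// stage 3 on the flow matches the full iadst16 above.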
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[1] = in[0];
+ x[3] = in[2];
+ x[5] = in[4];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[10] = in[5];
+ x[12] = in[3];
+ x[14] = in[1];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+ btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+ btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+ btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+ btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+ btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+ btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+ btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+  // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
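+// Full 32-point inverse DCT over 32 packed 16-bit vectors; saturating
+// adds/subs (vqaddq_s16/vqsubq_s16) keep the intermediates within 16-bit
+// range through the butterfly stages.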
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c5 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c6 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c7 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+ btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+ btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+ btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+ btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+ btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+ step2[0] = in[0];
+ step2[1] = in[16];
+ step2[2] = in[8];
+ step2[3] = in[24];
+ step2[4] = in[4];
+ step2[5] = in[20];
+ step2[6] = in[12];
+ step2[7] = in[28];
+ step2[8] = in[2];
+ step2[9] = in[18];
+ step2[10] = in[10];
+ step2[11] = in[26];
+ step2[12] = in[6];
+ step2[13] = in[22];
+ step2[14] = in[14];
+ step2[15] = in[30];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+ btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+ btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+ btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[4] = step2[4];
+ step1[5] = step2[5];
+ step1[6] = step2[6];
+ step1[7] = step2[7];
+
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+ btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+ btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[1], step1[2]);
+ step2[2] = vqsubq_s16(step1[1], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
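+// DC-only 32-point inverse DCT (eob == 1): the transform reduces to scaling
+// in[0] by cospi[32] and broadcasting the result to all 32 outputs.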
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+ out[16] = step1;
+ out[17] = step1;
+ out[18] = step1;
+ out[19] = step1;
+ out[20] = step1;
+ out[21] = step1;
+ out[22] = step1;
+ out[23] = step1;
+ out[24] = step1;
+ out[25] = step1;
+ out[26] = step1;
+ out[27] = step1;
+ out[28] = step1;
+ out[29] = step1;
+ out[30] = step1;
+ out[31] = step1;
+}
+
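+// 32-point inverse DCT with only the first 8 input vectors non-zero. Early
+// stages replace two-input butterflies with single-input forms and duplicate
+// assignments where the second operand is implicitly zero.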
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[4] = in[4];
+ step2[8] = in[2];
+ step2[12] = in[6];
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[16];
+ step1[18] = step2[19];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[21] = step2[20];
+ step1[22] = step2[23];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[24];
+ step1[26] = step2[27];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[29] = step2[28];
+ step1[30] = step2[31];
+ step1[31] = step2[31];
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[8];
+ step2[10] = step1[11];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[13] = step1[12];
+ step2[14] = step1[15];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = step2[4];
+ step1[5] = step2[4];
+ step1[6] = step2[7];
+ step1[7] = step2[7];
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[0];
+ step2[2] = step1[0];
+ step2[3] = step1[0];
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
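+// 32-point inverse DCT with only the first 16 input vectors non-zero; the
+// early stages again use single-input butterflies where the partner term is
+// zero, and the later stages follow the full idct32 above.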
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+ btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+ btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ step2[0] = in[0];
+ step2[2] = in[8];
+ step2[4] = in[4];
+ step2[6] = in[12];
+ step2[8] = in[2];
+ step2[10] = in[10];
+ step2[12] = in[6];
+ step2[14] = in[14];
+
+ // stage 3
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+ btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[0], step1[2]);
+ step2[2] = vqsubq_s16(step1[0], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
// Functions for blocks with eob at DC and within the
// top-left 8x8, 16x16 and 32x32 corners.
static const transform_1d_neon
@@ -90,10 +2112,37 @@ static const transform_1d_neon
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
-static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
- uint8_t *output, int stride,
- TX_TYPE tx_type,
- TX_SIZE tx_size, int eob) {
+
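+// Lookup of the packed 8-lane transforms above, indexed as
+// [tx size][1-D tx type][eob class]. NULL entries mark combinations that are
+// not reached through this path (4- and 64-point sizes) or that AV1 does not
+// define (32-point ADST).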
+static const transform_neon
+ lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
+ { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
+ { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+ {
+ { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
+ { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
+ NULL },
+ { identity16_new_neon, identity16_new_neon, identity16_new_neon,
+ NULL },
+ },
+ { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
+ idct32_new_neon },
+ { NULL, NULL, NULL, NULL },
+ { identity32_new_neon, identity32_new_neon, identity32_new_neon,
+ identity32_new_neon } },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
int32_t *temp_in = txfm_buf;
@@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
}
}
-static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
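+// IDTX (identity in both directions): the block is processed in 8x8 tiles
+// (load, transpose, row transform, round-shift), transposed into b[], then
+// column transformed, round-shifted and added back into the output.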
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[32 * 4];
+ int16x8_t b[32 * 4];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], ((txfm_size_col * txfm_size_row) >> 3),
+                               0);
+  lowbd_inv_txfm2d_memset_neon(&b[0], ((txfm_size_col * txfm_size_row) >> 3),
+                               0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
}
}
-static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
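+// Path for H_DCT/H_ADST/H_FLIPADST: the row pass carries the 1-D transform
+// and the column pass is the identity. For lr_flip the 8x8 tiles are
+// reversed and written to the mirrored strip before the column stage.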
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&b[0], ((txfm_size_col * txfm_size_row) >> 3),
+                               0);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
@@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
}
}
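+
+// Path for V_DCT/V_ADST/V_FLIPADST: the row pass is the identity and the
+// column pass carries the 1-D transform; ud_flip is applied while adding the
+// result back into the output buffer.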
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], ((txfm_size_col * txfm_size_row) >> 3),
+                               0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
uint8_t *output, int stride,
TX_TYPE tx_type,
@@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
}
}
-static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
@@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
}
}
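+
+// General two-pass path with genuine transforms in both directions. a[] and
+// b[] each hold 64 * 8 vectors, enough for a full 64x64 block of 16-bit
+// coefficients across the row and column passes.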
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[64 * 8];
+ int16x8_t b[64 * 8];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
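The function above trims work to the eob-derived nonzero region (row passes run only over buf_size_nonzero_h_div8 groups of eight rows), transposes each row pass into b, then runs full-height column passes. A minimal scalar sketch of the same row/column pipeline, with a hypothetical tx1d_fn standing in for the kernels picked from lowbd_txfm_all_1d_zeros_w_arr (illustrative only, not library code):

#include <stdint.h>

typedef void (*tx1d_fn)(int32_t *vec, int len);

/* Scalar outline of a 2-D inverse transform. The NEON code above does the
 * same thing eight 16-bit lanes at a time and fuses the transpose into the
 * row pass. */
static void inv_txfm2d_scalar_sketch(int32_t *buf, int w, int h,
                                     tx1d_fn row_tx, tx1d_fn col_tx) {
  for (int r = 0; r < h; ++r) row_tx(&buf[r * w], w); /* 1-D rows */
  for (int c = 0; c < w; ++c) {                       /* 1-D columns */
    int32_t col[64];                                  /* h <= 64 */
    for (int r = 0; r < h; ++r) col[r] = buf[r * w + c];
    col_tx(col, h);
    for (int r = 0; r < h; ++r) buf[r * w + c] = col[r];
  }
}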
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case IDTX:
+ lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ default:
+ lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
static INLINE void lowbd_inv_txfm2d_add_universe_neon(
const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
TX_SIZE tx_size, int eob) {
@@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon(
break;
}
}
+
void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, TX_SIZE tx_size,
int eob) {
@@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
break;
case TX_16X64: {
- lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X16: {
@@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_32X64: {
- lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X32: {
@@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
case TX_64X64: {
@@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
- lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type,
- tx_size, eob);
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
} break;
default:
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
index 6af2d61e7..9ec658291 100644
--- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
@@ -23,6 +23,8 @@
typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
const int8_t cos_bit,
const int8_t *stage_ptr);
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit);
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
@@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
*eoby = eob_fill[temp_eoby];
}
-#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
index 0d8233744..7134f183e 100644
--- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
uint8x8_t tmp0, tmp1;
uint8x16_t res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32, tmp1_32;
- uint16x4_t tmp0_16, tmp1_16;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
if (w >= 16) {
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
index 33b06b767..194e94c8c 100644
--- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
uint8x8_t tmp0, tmp1;
uint8x16_t tmp0_q, tmp1_q, res_q;
uint16x8_t res, res_low, res_high;
- uint32x2_t tmp0_32, tmp1_32;
- uint16x4_t tmp0_16, tmp1_16;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
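The vdup_n initialisers added in these two blend files make the partial-lane loads that follow well defined: NEON lane-insert loads read their destination vector, so every lane must be initialised first. A minimal sketch of the idiom (load_lane0 is a hypothetical helper):

#include <arm_neon.h>

/* Lane inserts such as vld1_lane_u32 read the whole destination vector,
 * so define all lanes before filling a single one. */
static inline uint32x2_t load_lane0(const uint32_t *ptr) {
  uint32x2_t v = vdup_n_u32(0); /* all lanes defined */
  return vld1_lane_u32(ptr, v, 0);
}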
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
index d731b6a66..39025b5e5 100644
--- a/third_party/aom/av1/common/arm/cfl_neon.c
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
-#if __ARM_ARCH <= 7
+#ifndef __aarch64__
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
@@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
// Permute and add in such a way that each lane contains the block sum.
// [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
-#if __ARM_ARCH >= 8
+#ifdef __aarch64__
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
#else
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
index f15744c94..d0c4f8ff6 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.c
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -13,6 +13,8 @@
#include <assert.h>
#include <arm_neon.h>
+#include "config/av1_rtcd.h"
+
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
@@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8(
return vqmovun_s16(sum);
}
+#if !defined(__aarch64__)
+static INLINE uint8x8_t convolve8_horiz_4x1(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+ int16x4_t sum;
+
+ sum = vmul_n_s16(s0, filter[0]);
+ sum = vmla_n_s16(sum, s1, filter[1]);
+ sum = vmla_n_s16(sum, s2, filter[2]);
+ sum = vmla_n_s16(sum, s5, filter[5]);
+ sum = vmla_n_s16(sum, s6, filter[6]);
+ sum = vmla_n_s16(sum, s7, filter[7]);
+  /* filter[3] can take a max value of 128, so that partial product alone can
+   * reach 128 * 255 = 32640 with 8-bit input; adding the remaining taps can
+   * overflow 16 bits, hence the saturating adds below. */
+ sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+ sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+ sum = vqrshl_s16(sum, shift_round_0);
+ sum = vqrshl_s16(sum, shift_by_bits);
+
+ return vqmovun_s16(vcombine_s16(sum, sum));
+}
+#endif // !defined(__aarch64__)
+
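The saturating adds above follow from simple arithmetic on the filter bounds; a quick worked check, assuming 8-bit pixels and the 7-bit filter precision noted in the comment:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int tap3_max = 128 * 255; /* largest single partial product: 32640 */
  /* Only 127 of headroom remains below INT16_MAX, so the other taps can
   * easily push the running sum past 16 bits; vqadd_s16 clamps instead. */
  printf("headroom above tap3 alone: %d\n", INT16_MAX - tap3_max);
  return 0;
}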
static INLINE uint8x8_t convolve8_vert_8x4(
const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
@@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
(void)conv_params;
(void)filter_params_y;
- uint8x8_t t0, t1, t2, t3;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+#endif
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
@@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
src -= horiz_offset;
-
+#if defined(__aarch64__)
if (h == 4) {
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
w -= 4;
} while (w > 0);
} else {
+#endif
int width;
const uint8_t *s;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+#endif
if (w <= 4) {
+#if defined(__aarch64__)
do {
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
}
h -= 8;
} while (h > 0);
+#else
+ int16x8_t tt0;
+ int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+ const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+ const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ x4 = vget_high_s16(tt0); // a4 a5 a6 a7
+
+ t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x7 = vget_low_s16(tt0); // a8 a9 a10 a11
+
+ x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
+ x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
+ x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
+ x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
+ x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
+ x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
+
+ src += src_stride;
+
+ t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+ shift_round_0_low, shift_by_bits_low);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ }
+ h -= 1;
+ } while (h > 0);
+#endif
} else {
uint8_t *d;
- int16x8_t s11, s12, s13, s14;
-
+ int16x8_t s11;
+#if defined(__aarch64__)
+ int16x8_t s12, s13, s14;
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
+#else
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ width = w;
+ s = src + 8;
+ d = dst;
+ __builtin_prefetch(dst);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s11 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ vst1_u8(d, t0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 1;
+ } while (h > 0);
+#endif
}
+#if defined(__aarch64__)
}
+#endif
}
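On the AArch32 (!__aarch64__) paths added throughout this file, each loop iteration produces one output row and then rotates the seven rows of vertical filter context, rather than consuming four or eight rows per iteration as the AArch64 paths do. A sketch of that rotation (rotate_context is a hypothetical helper; the real code writes the s0 = s1; ... s6 = s7; chain out by hand):

#include <arm_neon.h>

/* After emitting one output row, shift the vertical context up by one:
 * s0 <- s1, ..., s6 <- s7. The next iteration loads a fresh s7. */
static inline void rotate_context(int16x8_t s[8]) {
  for (int i = 0; i < 7; ++i) s[i] = s[i + 1];
}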
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
filter_params_y, subpel_y_q4 & SUBPEL_MASK);
if (w <= 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t d01;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ uint8x8_t d23;
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
do {
s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
+#if defined(__aarch64__)
s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
src += src_stride;
s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
@@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
h -= 4;
+#else
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+ dst += dst_stride;
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ h -= 1;
+#endif
} while (h > 0);
} else {
int height;
const uint8_t *s;
uint8_t *d;
- uint8x8_t t0, t1, t2, t3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
+ uint8x8_t t0;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+ int16x8_t s8, s9, s10;
+#endif
do {
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
@@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
do {
s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
+#if defined(__aarch64__)
s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
s += src_stride;
s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
@@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ __builtin_prefetch(d);
+ __builtin_prefetch(s);
+
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
src += 8;
dst += 8;
@@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
ConvolveParams *conv_params) {
int im_dst_stride;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
@@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
assert(conv_params->round_0 > 0);
if (w <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
do {
s = src_ptr;
+
+#if defined(__aarch64__)
__builtin_prefetch(s + 0 * src_stride);
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
@@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
src_ptr += 4 * src_stride;
dst_ptr += 4 * im_dst_stride;
height -= 4;
+#else
+ int16x8_t tt0;
+
+ __builtin_prefetch(s);
+
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ if (w == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += im_dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += im_dst_stride;
+ }
+
+ src_ptr += src_stride;
+ height -= 1;
+#endif
} while (height > 0);
} else {
int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+#endif
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+#if defined(__aarch64__)
do {
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
@@ -886,6 +1100,45 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
dst_ptr += 8 * im_dst_stride;
height -= 8;
} while (height > 0);
+#else
+ do {
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t sum = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += im_dst_stride;
+ height -= 1;
+ } while (height > 0);
+#endif
}
// vertical
@@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
width = w;
if (width <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t dd0, dd1;
- uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
+#endif
d_u8 = dst_u8_ptr;
v_s = v_src_ptr;
@@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
v_s += (7 * im_stride);
do {
+#if defined(__aarch64__)
load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1008,11 +1269,48 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t res0, res1, res2, res3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
do {
__builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
height = h;
do {
+#if defined(__aarch64__)
load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
v_src_ptr += 8;
dst_u8_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
index 47c93d645..f382984f2 100644
--- a/third_party/aom/av1/common/arm/convolve_neon.h
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_
-#define AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#include <arm_neon.h>
@@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32(
return res;
}
-#endif // AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
index 4015082b4..e5674ef7c 100644
--- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -22,12 +22,108 @@
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const_vec,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0;
+ uint16x4_t tmp_u0;
+ uint32x4_t sum0;
+ int32x4_t dst0;
+ int16x8_t tmp4;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+
+ sum0 = vmull_n_u16(res0, fwd_offset);
+ sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ } else {
+ const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+ tmp_u0 = vhadd_u16(res0, d0);
+
+ tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+ tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ }
+}
+
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0, tmp2;
+ int16x8_t f0;
+ uint32x4_t sum0, sum2;
+ int32x4_t dst0, dst2;
+
+ uint16x8_t tmp_u0;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+ const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+ sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+ sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+ sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp2 = vqmovn_s32(dst2);
+
+ f0 = vcombine_s16(tmp0, tmp2);
+
+ *t0 = vqmovun_s16(f0);
+
+ } else {
+ const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+ const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+ tmp_u0 = vhaddq_u16(res0, d0);
+
+ f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+ f0 = vqrshlq_s16(f0, round_bits_vec);
+
+ *t0 = vqmovun_s16(f0);
+ }
+}
+#endif // !defined(__aarch64__)
+
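compute_avg_4x1/compute_avg_8x1 are single-row versions of the compound average: either a distance-weighted blend of the two predictions or a plain halving add, followed by offset removal, a rounding shift, and a clamp to 8 bits. A scalar sketch of one output sample (intermediate saturations omitted, arithmetic right shift and round_bits >= 1 assumed; DIST_PRECISION_BITS is 4 in this codebase):

#include <stdint.h>

static uint8_t jnt_avg_scalar(uint16_t res, uint16_t d, uint16_t fwd,
                              uint16_t bck, int16_t sub_const,
                              int round_bits, int use_jnt) {
  int32_t v;
  if (use_jnt) /* distance-weighted: matches the vmull/vmlal path */
    v = ((int32_t)res * fwd + (int32_t)d * bck) >> 4 /* DIST_PRECISION_BITS */;
  else /* plain average: matches vhadd_u16 */
    v = ((int32_t)res + d) >> 1;
  v -= sub_const;                                  /* remove rounding offset */
  v = (v + (1 << (round_bits - 1))) >> round_bits; /* rounding shift */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* vqmovun_s16 clamp */
}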
static INLINE void compute_avg_4x4(
uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const_vec, const int16_t round_bits,
- const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
int16x4_t tmp0, tmp1, tmp2, tmp3;
uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4(
uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const, const int16_t round_bits,
- const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
uint8x8_t *t3) {
int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
@@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon(
int16_t *dst_ptr;
int dst_stride;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
dst_ptr = im_block;
dst_stride = im_stride;
@@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon(
width = w;
if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint8x8_t t0;
const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint8x8_t t1, t2, t3;
+#endif
do {
s = src;
__builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
@@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon(
src += 4 * src_stride;
dst_ptr += 4 * dst_stride;
height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+      t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 (a8..a11 used)
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1_s16(dst_ptr, d0);
+
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
} while (height > 0);
} else {
int16_t *d_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint8x8_t t0;
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
-
do {
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
@@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon(
src += 8 * src_stride;
dst_ptr += 8 * dst_stride;
height -= 8;
+#else
+ int16x8_t temp_0;
+ t0 = vld1_u8(src);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src + 8;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, horiz_const, shift_round_0);
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
} while (height > 0);
}
}
@@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon(
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t res4, res5, res6, res7;
- uint16x4_t d0, d1, d2, d3;
- uint8x8_t t0, t1;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t res4, d0;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t res5, res6, res7, d1, d2, d3;
+ uint8x8_t t1;
+#endif
dst = conv_params->dst;
src_ptr = im_block;
@@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
s += (7 * im_stride);
do {
+#if defined(__aarch64__)
load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
s += (im_stride << 2);
@@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon(
bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
&t0, &t1);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 1); // 30 31 32 33
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
d_u8 += dst8_stride;
} else {
@@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon(
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1_s16(s);
+ s += (im_stride);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1_u16(d, d0);
+ d += (dst_stride);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height--;
+#endif
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
uint8_t *dst_u8_ptr;
CONV_BUF_TYPE *d, *dst_ptr;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
s = src_ptr;
dst_ptr = dst;
dst_u8_ptr = dst8;
@@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
height = h;
if ((w == 4) || (h == 4)) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- int16x8_t tt0, tt1, tt2, tt3;
- uint16x4_t res4, res5, res6, res7;
- uint32x2_t tu0, tu1;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint16x4_t res4;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint16x4_t res5, res6, res7;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
int16x8_t u0, u1;
+#else
+ int16x4_t temp_0;
+#endif
const int16x4_t zero = vdup_n_s16(0);
const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
@@ -746,6 +959,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d_u8 = dst_u8_ptr;
width = w;
__builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
@@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
dst_ptr += (dst_stride << 2);
dst_u8_ptr += (dst8_stride << 2);
height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(d);
+
+ s += 8;
+ do {
+        t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 (a8..a11 used)
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ temp_0 = s7;
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ s0 = s4;
+ s4 = temp_0;
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+ __builtin_prefetch(d_u8);
+
+ res4 = vld1_u16(d);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset_vec, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ }
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride);
+ dst_ptr += (dst_stride);
+ dst_u8_ptr += (dst8_stride);
+ height--;
+#endif
} while (height > 0);
} else {
CONV_BUF_TYPE *d_tmp;
uint8_t *d_u8_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t res8, res9, res10, res11;
-
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
@@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d = dst_ptr = dst;
d_u8 = dst_u8_ptr = dst8;
do {
+#if defined(__aarch64__)
+ int16x8_t s11, s12, s13, s14;
+ int16x8_t s8, s9, s10;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res9, res10, res11;
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
@@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
dst_ptr += 8 * dst_stride;
dst_u8_ptr += 8 * dst8_stride;
height -= 8;
+#else
+ int16x8_t temp_0;
+ __builtin_prefetch(src_ptr);
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ d_u8 = d_u8_tmp;
+ d_tmp = d;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ if (conv_params->do_average) {
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += (dst_stride);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst_u8_ptr += dst8_stride;
+ height--;
+#endif
} while (height > 0);
}
}
@@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
uint8_t *dst_u8_ptr;
CONV_BUF_TYPE *d, *dst_ptr;
int width, height;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
s = src_ptr;
dst_ptr = dst;
@@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
assert((conv_params->round_1 - 2) >= bits);
if ((w == 4) || (h == 4)) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- uint16x4_t res4, res5, res6, res7;
- uint32x2_t tu0, tu1, tu2, tu3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ uint16x4_t res4;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
int16x8_t u0, u1, u2, u3;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ uint16x4_t res5, res6, res7;
+ uint8x8_t t1;
+#endif
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x4_t shift_vec = vdup_n_s16(-shift_value);
const int16x4_t zero = vdup_n_s16(0);
@@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s += (7 * src_stride);
do {
+#if defined(__aarch64__)
load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
@@ -1154,17 +1492,13 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
round_offset64, round_bits, use_jnt_comp_avg, &t0,
&t1);
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
- 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
d_u8 += dst8_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1),
- 1); // 30 31 32 33
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
d_u8 += dst8_stride;
} else {
store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
@@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s += (src_stride << 2);
height -= 4;
+#else
+ load_unaligned_u8_4x1(s, src_stride, &tu0);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ s7 = vget_low_s16(u0);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+
+ d0 = vadd_s16(d0, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ d += (dst_stride);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ s += (src_stride);
+ height--;
+#endif
} while (height > 0);
src_ptr += 4;
dst_ptr += 4;
@@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
} while (width > 0);
} else {
CONV_BUF_TYPE *d_tmp;
- int16x8_t s11, s12, s13, s14;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t res8, res9, res10, res11;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
const int16x4_t round_offset64 = vdup_n_s16(round_offset);
const int16x8_t zero = vdupq_n_s16(0);
-
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+      uint16x8_t res9, res10, res11;
+#endif
dst_ptr = dst;
dst_u8_ptr = dst8;
do {
@@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
d_u8 = dst_u8_ptr;
do {
+#if defined(__aarch64__)
load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
s6 = s14;
s += (8 * src_stride);
height -= 8;
+#else
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp);
+
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += dst_stride;
+ }
+
+ s += (src_stride);
+ height--;
+#endif
} while (height > 0);
src_ptr += 8;
dst_ptr += 8;
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
index 4bf45a52c..c4ae2e784 100644
--- a/third_party/aom/av1/common/arm/mem_neon.h
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_MEM_NEON_H_
-#define AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
#include <arm_neon.h>
#include <string.h>
@@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
*tu1 = vset_lane_u32(a, *tu1, 1);
}
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+ uint32x2_t *tu0) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+}
+
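The memcpy in load_unaligned_u8_4x1 is the portable unaligned-load idiom: it avoids the undefined behaviour of dereferencing a misaligned uint32_t pointer yet still compiles to a single 32-bit load on ARM. An equivalent standalone sketch (load4_any_alignment is a hypothetical name):

#include <arm_neon.h>
#include <string.h>

static inline uint32x2_t load4_any_alignment(const uint8_t *buf) {
  uint32_t a;
  memcpy(&a, buf, 4); /* safe at any alignment */
  return vset_lane_u32(a, vdup_n_u32(0), 0);
}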
static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
uint32x2_t *tu0) {
uint32_t a;
@@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
vst1q_u32(s, s4);
}
-#endif // AV1_COMMON_ARM_MEM_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
index b4808a972..b3a37c4cb 100644
--- a/third_party/aom/av1/common/arm/selfguided_neon.c
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
}
-void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride,
- int16_t *src, const int src_stride,
- int32_t *dst, const int dst_stride,
- const int width, const int height) {
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+ const int buf_stride, int16_t *src,
+ const int src_stride, int32_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
int16x8_t s0;
int32_t *B_tmp, *dst_ptr;
uint16_t *A_tmp;
@@ -1340,10 +1341,10 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
}
}
-void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
- int stride, int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+ int stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
const sgr_params_type *const params = &sgr_params[sgr_params_idx];
assert(!(params->r[0] == 0 && params->r[1] == 0));
@@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
if (params->r[1] > 0)
restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
bit_depth, sgr_params_idx, 1);
+ return 0;
}
void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
index fe134087b..8a3d9f07f 100644
--- a/third_party/aom/av1/common/arm/transpose_neon.h
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
#include <arm_neon.h>
@@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+ const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+ const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+ const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *out = d0.val[0];
+ *(out + 1) = d1.val[0];
+ *(out + 2) = d2.val[0];
+ *(out + 3) = d3.val[0];
+ *(out + 4) = d0.val[1];
+ *(out + 5) = d1.val[1];
+ *(out + 6) = d2.val[1];
+ *(out + 7) = d3.val[1];
+}
+
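Viewed as a plain 8x8 matrix of int16_t, transpose_s16_8x8q computes the ordinary transpose; the vtrn ladder above is the logarithmic in-register way of doing it. Reference behaviour as a scalar sketch:

#include <stdint.h>

/* out[i][j] = in[j][i]; equivalent to transpose_s16_8x8q when the eight
 * int16x8_t vectors are taken as the eight rows of the matrix. */
static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j) out[i][j] = in[j][i];
}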
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
int16x4_t *a2, int16x4_t *a3) {
// Swap 16 bit elements. Goes from:
@@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
*a3 = c1.val[1];
}
-#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..7f02d42a7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <string.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+   The coefficients in this table are rearranged so that they can be brought
+   into the required order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
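
For reference: the table above (presumably filter_8bit_neon, as used below — one row of eight signed 8-bit taps per quantized sub-pel position across [-1, 2)) is indexed in the horizontal pass. A minimal sketch of the row selection, with a hypothetical helper name:

    // Constants (WARPEDDIFF_PREC_BITS, the alpha step) as defined in
    // av1/common/warped_motion.h; sx is pre-offset by the caller so the
    // index is non-negative over the table's [-1, 2) range.
    static const int8_t *pick_warp_taps(int sx, int k, int alpha) {
      return filter_8bit_neon[(sx + k * alpha) >> WARPEDDIFF_PREC_BITS];
    }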
+
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+ uint8x8_t src_1, int16x4_t *res) {
+ int16x8_t coeff_0, coeff_1;
+ int16x8_t pix_0, pix_1;
+
+ coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+ vreinterpret_s16_s32(x1.val[0]));
+ coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+ vreinterpret_s16_s32(x1.val[1]));
+
+ pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
+ pix_0 = vmulq_s16(coeff_0, pix_0);
+
+ pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
+ pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
+
+ *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
+}
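
A scalar model of what convolve() computes may help: after the caller's vtrn_s32 transposes, each coefficient vector interleaves taps for two outputs, and the trailing vpadd_s16 folds the eight products into four lane sums. A minimal sketch (hypothetical name, illustrative only):

    static void convolve_model(const int16_t c0[8], const int16_t c1[8],
                               const uint8_t s0[8], const uint8_t s1[8],
                               int16_t res[4]) {
      int16_t prod[8];
      for (int i = 0; i < 8; ++i)  // vmulq_s16 + vmlaq_s16
        prod[i] = (int16_t)(c0[i] * s0[i] + c1[i] * s1[i]);
      for (int i = 0; i < 4; ++i)  // vpadd_s16 of low/high halves
        res[i] = (int16_t)(prod[2 * i] + prod[2 * i + 1]);
    }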
+
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+ uint8x16_t src_3, uint8x16_t src_4,
+ int16x8_t *tmp_dst, int sx, int alpha,
+ int k, const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+ 255, 0, 255, 0, 255, 0, 255, 0 };
+ const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+ const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int32x2x2_t b0, b1;
+ uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
+ int32x4_t tmp_res_low, tmp_res_high;
+ uint16x8_t res;
+ int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
+
+ uint8x16_t tmp_0 = vandq_u8(src_1, mask);
+ uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+ uint8x16_t tmp_2 = vandq_u8(src_3, mask);
+ uint8x16_t tmp_3 = vandq_u8(src_4, mask);
+
+ tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
+ tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
+
+ src_1 = vaddq_u8(tmp_0, tmp_2);
+ src_2 = vaddq_u8(tmp_1, tmp_3);
+
+ src_1_low = vget_low_u8(src_1);
+ src_2_low = vget_low_u8(src_2);
+ src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+ src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+ src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+ src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+
+ // Loading the 8 filter taps
+ f0 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f1 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f2 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f3 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f4 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f5 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f6 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f7 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+ vreinterpret_s32_s16(vget_low_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+ vreinterpret_s32_s16(vget_low_s16(f6)));
+ convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+ vreinterpret_s32_s16(vget_low_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+ vreinterpret_s32_s16(vget_low_s16(f7)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+ vreinterpret_s32_s16(vget_high_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+ vreinterpret_s32_s16(vget_high_s16(f6)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+ vreinterpret_s32_s16(vget_high_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+ vreinterpret_s32_s16(vget_high_s16(f7)));
+ convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+
+ tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
+ tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+ res = vqrshlq_u16(res, shift);
+
+ tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+}
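
The final add/saturate/shift sequence mirrors the scalar warp reference; a minimal model of that rounding step, assuming reduce_bits_horiz > 0 (hypothetical name):

    static uint16_t horiz_round_model(int32_t acc, int offset_bits_horiz,
                                      int reduce_bits_horiz) {
      int64_t v = (int64_t)acc + (1 << offset_bits_horiz);  // add_const
      if (v < 0) v = 0;          // vqmovun_s32: saturate to unsigned 16 bit
      if (v > 65535) v = 65535;
      // vqrshlq_u16 by a negative amount: rounding shift right
      return (uint16_t)((v + (1 << (reduce_bits_horiz - 1))) >>
                        reduce_bits_horiz);
    }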
+
+static INLINE void vertical_filter_neon(const int16x8_t *src,
+ int32x4_t *res_low, int32x4_t *res_high,
+ int sy, int gamma) {
+ int16x4_t src_0, src_1, fltr_0, fltr_1;
+ int32x4_t res_0, res_1;
+ int32x2_t res_0_im, res_1_im;
+ int32x4_t res_even, res_odd, im_res_0, im_res_1;
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int16x8x2_t b0, b1, b2, b3;
+ int32x4x2_t c0, c1, c2, c3;
+ int32x4x2_t d0, d1, d2, d3;
+
+ b0 = vtrnq_s16(src[0], src[1]);
+ b1 = vtrnq_s16(src[2], src[3]);
+ b2 = vtrnq_s16(src[4], src[5]);
+ b3 = vtrnq_s16(src[6], src[7]);
+
+ c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b0.val[1]));
+ c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b2.val[1]));
+ c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ f0 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f1 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f2 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f3 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f4 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f5 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f6 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f7 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
+ d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+ d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
+ d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+ // row:0,1 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 even_col:0,2,4,6
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 even_col:0,2,4,6
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 even_col:0,2,4,6
+ res_even = vaddq_s32(im_res_0, im_res_1);
+
+ // row:0,1 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 odd_col:1,3,5,7
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 odd_col:1,3,5,7
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 odd_col:1,3,5,7
+ res_odd = vaddq_s32(im_res_0, im_res_1);
+
+ // reordering as 0 1 2 3 | 4 5 6 7
+ c0 = vtrnq_s32(res_even, res_odd);
+
+ // Final store
+ *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+ *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
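
Underneath the transposes, this is an ordinary 8-tap vertical dot product with a per-column filter. A scalar sketch of what one output column x (0..7) receives (hypothetical name; warped_filter and WARPEDDIFF_PREC_BITS as in av1/common/warped_motion.h):

    static int32_t vert_filter_model(const int16_t src[8][8], int x, int sy,
                                     int gamma) {
      const int16_t *f = (const int16_t *)(
          warped_filter + ((sy + x * gamma) >> WARPEDDIFF_PREC_BITS));
      int32_t acc = 0;
      for (int r = 0; r < 8; ++r) acc += (int32_t)src[r][x] * f[r];
      return acc;
    }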
+
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int16x8_t tmp[15];
+ const int bd = 8;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+ const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+ const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
+
+ int limit = 0;
+ uint8x16_t vec_dup, mask_val;
+ int32x4_t res_lo, res_hi;
+ int16x8_t result_final;
+ uint8x16_t src_1, src_2, src_3, src_4;
+ uint8x16_t indx_vec = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ uint8x16_t cmp_vec;
+
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16x4_t res_sub_const =
+ vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1))));
+ int k;
+
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ // horizontal
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+
+ if (out_of_boundary_left >= 0) {
+ limit = out_of_boundary_left + 1;
+ cmp_vec = vdupq_n_u8(out_of_boundary_left);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcleq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ if (out_of_boundary_right >= 0) {
+ limit = 15 - (out_of_boundary_right + 1);
+ cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcgeq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ }
+
+ // vertical
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+
+ res_lo = vaddq_s32(res_lo, add_const_vert);
+ res_hi = vaddq_s32(res_hi, add_const_vert);
+
+ if (conv_params->is_compound) {
+ uint16_t *const p =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ uint16x4_t tmp16_lo = vld1_u16(p);
+ int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+ int16x4_t tmp16_low;
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = vmulq_s32(res_lo, bwd);
+ tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+ }
+ int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+ res_low = vqrshl_s16(res_low, round_bits_vec);
+ int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+ uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
+ } else {
+ uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+ vst1_u16(p, res_u16_low);
+ }
+ if (p_width > 4) {
+ uint16_t *const p4 =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+
+ uint16x4_t tmp16_hi = vld1_u16(p4);
+ int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+ int16x4_t tmp16_high;
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = vmulq_s32(res_hi, bwd);
+ tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+ } else {
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+ }
+ int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+ res_high = vqrshl_s16(res_high, round_bits_vec);
+ int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+ uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+
+ vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+ 0);
+ } else {
+ uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+ vst1_u16(p4, res_u16_high);
+ }
+ }
+ } else {
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+
+ result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+ result_final = vsubq_s16(result_final, sub_constant);
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(result_final);
+
+ if (p_width == 4) {
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ } else {
+ vst1_u8(p, val);
+ }
+ }
+ }
+ }
+ }
+}
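
The (ix4, sx4)/(iy4, sy4) computation at the top of the block loop is the usual fixed-point split of the affine map; a minimal scalar sketch of the x component (hypothetical name; WARPEDMODEL_PREC_BITS is 16 in libaom):

    static void affine_split_model(const int32_t mat[6], int32_t x, int32_t y,
                                   int32_t *ix, int32_t *sx) {
      const int32_t dst_x = mat[2] * x + mat[3] * y + mat[0];
      *ix = dst_x >> WARPEDMODEL_PREC_BITS;              // integer column
      *sx = dst_x & ((1 << WARPEDMODEL_PREC_BITS) - 1);  // sub-pel fraction
    }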
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
index 72fbed4d4..a9bb5bcf0 100644
--- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -26,7 +26,6 @@
Apply horizontal filter and store in a temporary buffer. When applying
vertical filter, overwrite the original pixel values.
*/
-
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
/* if height is a multiple of 8 */
if (!(h & 7)) {
int16x8_t res0, res1, res2, res3;
- uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+ uint16x8_t res4;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
+ uint16x8_t res5, res6, res7, res8, res9, res10, res11;
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
do {
@@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
dst_ptr += 8 * MAX_SB_SIZE;
height -= 8;
} while (height > 0);
+#else
+ uint8x8_t temp_0;
+
+ do {
+ const uint8_t *s;
+
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ vst1q_u16(d_tmp, res4);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height--;
+ } while (height > 0);
+#endif
} else {
    /* if height is a multiple of 4 */
- int16x8_t tt0, tt1, tt2, tt3;
const uint8_t *s;
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint16x8_t d0;
+ uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
- uint16x8_t d0, d1, d2, d3;
+ uint16x8_t d1, d2, d3;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
int16x4_t s11, s12, s13, s14;
- uint8x8_t t0, t1, t2, t3;
-
do {
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
@@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
dst_ptr += 4 * MAX_SB_SIZE;
height -= 4;
} while (height > 0);
+#else
+ uint8x8_t temp_0, t4, t5, t6, t7;
+
+ do {
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ __builtin_prefetch(dst_ptr);
+
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+ conv_params->round_0);
+
+ vst1q_u16(d_tmp, d0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height -= 1;
+ } while (height > 0);
+#endif
}
{
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t t0, t1, t2, t3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t t1, t2, t3;
+#endif
int16_t *src_tmp_ptr, *s;
uint8_t *dst_tmp_ptr;
height = h;
@@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
d = dst_tmp_ptr;
height = h;
+#if defined(__aarch64__)
do {
__builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
@@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 8;
} while (w > 0);
+#else
+ do {
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+ s7 = vld1q_s16(s);
+ s += src_stride;
+
+ t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+ bd, conv_params->round_1);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+ } while (height > 0);
+
+ src_tmp_ptr += 8;
+ dst_tmp_ptr += 8;
+
+ w -= 8;
+ } while (w > 0);
+#endif
}
}
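
The new armv7 (#else) paths exploit the symmetry of the Wiener kernel: with f[k] == f[6 - k], mirrored pixels are summed (vaddl_u8) before multiplying, so the 7-tap horizontal filter costs four multiplies per output; the vertical tail keeps a seven-register sliding window (s0..s6) and loads one new row per iteration. A scalar sketch of the fold, assuming filter_x_tmp holds the four unique taps (hypothetical name):

    static int32_t wiener_horiz_fold_model(const uint8_t a[7],
                                           const int16_t f[4]) {
      return f[0] * (a[0] + a[6]) + f[1] * (a[1] + a[5]) +
             f[2] * (a[2] + a[4]) + f[3] * a[3];
    }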
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
index 8514dc64c..7ef2d6d7f 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.c
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -11,56 +11,7 @@
#include <stdlib.h>
#include "av1/common/av1_inv_txfm1d.h"
-
-static void range_check_buf(int32_t stage, const int32_t *input,
- const int32_t *buf, int32_t size, int8_t bit) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- const int64_t max_value = (1LL << (bit - 1)) - 1;
- const int64_t min_value = -(1LL << (bit - 1));
-
- int in_range = 1;
-
- for (int i = 0; i < size; ++i) {
- if (buf[i] < min_value || buf[i] > max_value) {
- in_range = 0;
- }
- }
-
- if (!in_range) {
- fprintf(stderr, "Error: coeffs contain out-of-range values\n");
- fprintf(stderr, "size: %d\n", size);
- fprintf(stderr, "stage: %d\n", stage);
- fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
- max_value);
-
- fprintf(stderr, "coeffs: ");
-
- fprintf(stderr, "[");
- for (int j = 0; j < size; j++) {
- if (j > 0) fprintf(stderr, ", ");
- fprintf(stderr, "%d", input[j]);
- }
- fprintf(stderr, "]\n");
-
- fprintf(stderr, " buf: ");
-
- fprintf(stderr, "[");
- for (int j = 0; j < size; j++) {
- if (j > 0) fprintf(stderr, ", ");
- fprintf(stderr, "%d", buf[j]);
- }
- fprintf(stderr, "]\n\n");
- }
-
- assert(in_range);
-#else
- (void)stage;
- (void)input;
- (void)buf;
- (void)size;
- (void)bit;
-#endif
-}
+#include "av1/common/av1_txfm.h"
// TODO(angiebird): Make 1-d txfm functions static
//
@@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = input[2];
bf1[2] = input[1];
bf1[3] = input[3];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[5];
bf1[6] = input[3];
bf1[7] = input[7];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[7] = bf0[7];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = input[11];
bf1[14] = input[7];
bf1[15] = input[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
bf1[15] = bf0[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = input[23];
bf1[30] = input[15];
bf1[31] = input[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -437,7 +388,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29];
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
output[1] = round_shift(x1, bit);
output[2] = round_shift(x2, bit);
output[3] = round_shift(x3, bit);
- range_check_buf(6, input, output, 4, stage_range[6]);
}
void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[4];
bf1[6] = input[1];
bf1[7] = input[6];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = input[12];
bf1[14] = input[1];
bf1[15] = input[14];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13];
bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = input[47];
bf1[62] = input[31];
bf1[63] = input[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 10
stage++;
@@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check_buf(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 11
stage++;
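
Every stage above leans on half_btf(); for reference, a minimal model of that helper (declared in av1/common/av1_txfm.h) and the round_shift it applies:

    static int32_t half_btf_model(int32_t w0, int32_t in0, int32_t w1,
                                  int32_t in1, int bit) {
      const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((sum + (1LL << (bit - 1))) >> bit);  // round_shift
    }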
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
index 64a1a921c..c31c019aa 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_INV_TXFM1D_H_
-#define AV1_INV_TXFM1D_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
#include "av1/common/av1_txfm.h"
@@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
}
#endif
-#endif // AV1_INV_TXFM1D_H_
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
index 4c600f756..7d80a0099 100644
--- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_INV_TXFM2D_CFG_H_
-#define AV1_INV_TXFM2D_CFG_H_
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
#include "av1/common/av1_inv_txfm1d.h"
// sum of fwd_shift_##
@@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
-#endif // AV1_INV_TXFM2D_CFG_H_
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
index 9d68b8760..537d8dfe9 100644
--- a/third_party/aom/av1/common/av1_loopfilter.c
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -68,23 +68,6 @@ static const int mode_lf_lut[] = {
// 10101010|10101010
//
// A loopfilter should be applied to every other 4x4 horizontally.
-// TODO(chengchen): make these tables static
-const FilterMask left_txform_mask[TX_SIZES] = {
- { { 0xffffffffffffffffULL, // TX_4X4,
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
-
- { { 0x5555555555555555ULL, // TX_8X8,
- 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } },
-
- { { 0x1111111111111111ULL, // TX_16X16,
- 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } },
-
- { { 0x0101010101010101ULL, // TX_32X32,
- 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } },
-
- { { 0x0001000100010001ULL, // TX_64X64,
- 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
-};
// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
// We use 4 uint64_t to represent the 256 bit.
@@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = {
// 00000000|00000000
//
// A loopfilter should be applied to every other 4x4 horizontally.
-const FilterMask above_txform_mask[TX_SIZES] = {
- { { 0xffffffffffffffffULL, // TX_4X4
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
- { { 0x0000ffff0000ffffULL, // TX_8X8
- 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } },
-
- { { 0x000000000000ffffULL, // TX_16X16
- 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } },
-
- { { 0x000000000000ffffULL, // TX_32X32
- 0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } },
-
- { { 0x000000000000ffffULL, // TX_64X64
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
};
-// 64 bit mask to shift and set for each prediction size. A bit is set for
-// each 4x4 block that would be in the top left most block of the given block
-// size in the 64x64 block.
-const FilterMask size_mask_y[BLOCK_SIZES_ALL] = {
- { { 0x0000000000000001ULL, // BLOCK_4X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000010001ULL, // BLOCK_4X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000000003ULL, // BLOCK_8X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0000000000030003ULL, // BLOCK_8X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0003000300030003ULL, // BLOCK_8X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00000000000f000fULL, // BLOCK_16X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000f000f000f000fULL, // BLOCK_16X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000f000f000f000fULL, // BLOCK_16X32
- 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X32
- 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x00ff00ff00ff00ffULL, // BLOCK_32X64
- 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
-
- { { 0xffffffffffffffffULL, // BLOCK_64X32
- 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0xffffffffffffffffULL, // BLOCK_64X64
- 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
- // Y plane max coding block size is 128x128, but the codec divides it
- // into 4 64x64 blocks.
- // BLOCK_64X128
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
- // BLOCK_128X64
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
- // BLOCK_128X128
- { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } },
-
- { { 0x0001000100010001ULL, // BLOCK_4X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x000000000000000fULL, // BLOCK_16X4
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
-
- { { 0x0003000300030003ULL, // BLOCK_8X32
- 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
- { { 0x0000000000ff00ffULL, // BLOCK_32X8
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
- { { 0x000f000f000f000fULL, // BLOCK_16X64
- 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } },
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, 0, 1, 2,
+ 3, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
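The id tables map each block size to a row of the mask table that follows; -1 marks block/tx pairings that cannot occur. The lookup presumably takes this shape (hypothetical helper; the base offsets are inferred from the 19 TX_4X4 rows coming first and are not shown in this hunk):

    static const FilterMask *left_mask_row_model(TX_SIZE ts, BLOCK_SIZE bs) {
      int row = -1;
      if (ts == TX_4X4) row = mask_id_table_tx_4x4[bs];            // rows 0..18
      else if (ts == TX_8X8) row = 19 + mask_id_table_tx_8x8[bs];  // rows 19..32
      assert(row >= 0);  // -1 would mean an impossible block/tx pairing
      return &left_mask_univariant_reordered[row];
    }
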
+const FilterMask left_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+ 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+ 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+ 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+ 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+ 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
- { { 0xffffffffffffffffULL, // BLOCK_64X16
- 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }
+const FilterMask above_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+ 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+ 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+ 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+ 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+ 0x000000000000000fULL } }, // block size 16X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
};
LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
int mi_col) {
- if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
- (mi_col << MI_SIZE_LOG2) >= cm->width)
- return NULL;
assert(cm->lf.lfm != NULL);
const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64
const int col = mi_col >> MIN_MIB_SIZE_LOG2;
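// e.g. mi_row = 40, mi_col = 24 (4x4-unit coordinates) select the mask of
// the 64x64 superblock at (row, col) = (40 >> 4, 24 >> 4) = (2, 1).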
@@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
SIMD_WIDTH);
}
}
-static uint8_t get_filter_level(const AV1_COMMON *cm,
- const loop_filter_info_n *lfi_n,
- const int dir_idx, int plane,
- const MB_MODE_INFO *mbmi) {
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+ const int dir_idx, int plane,
+ const MB_MODE_INFO *mbmi) {
const int segment_id = mbmi->segment_id;
if (cm->delta_lf_present_flag) {
int delta_lf;
@@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
}
}
}
-
-#if LOOP_FILTER_BITMASK
- memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64,
- sizeof(TX_SIZE) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.y_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.y_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.u_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.u_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.v_level_above, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.v_level_left, 0,
- sizeof(uint8_t) * MI_SIZE_64X64);
- memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64);
-#endif // LOOP_FILTER_BITMASK
}
#if LOOP_FILTER_BITMASK
@@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
// After locating the uint64_t that holds the bit, mi_row % 4 gives the
// row offset within it, and each row holds 16 = (1 << stride_log2) 4x4
// units. Therefore, shift = (row << stride_log2) + mi_col;
-static int get_index_shift(int mi_col, int mi_row, int *index) {
+int get_index_shift(int mi_col, int mi_row, int *index) {
// *index = mi_row >> 2;
// rows = mi_row % 4;
// stride_log2 = 4;
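// Worked example: mi_row = 5, mi_col = 3 gives *index = 5 >> 2 = 1 and
// shift = ((5 % 4) << 4) + 3 = 19, i.e. bit 19 of bits[1].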
@@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
else
lfm->lfl_y_hor[row][col] = level;
} else if (plane == 1) {
- if (dir == VERT_EDGE)
- lfm->lfl_u_ver[row][col] = level;
- else
- lfm->lfl_u_hor[row][col] = level;
+ lfm->lfl_u[row][col] = level;
} else {
- if (dir == VERT_EDGE)
- lfm->lfl_v_ver[row][col] = level;
- else
- lfm->lfl_v_hor[row][col] = level;
+ lfm->lfl_v[row][col] = level;
}
}
}
@@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
const TX_SIZE prev_tx_size =
plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
: mbmi_prev->tx_size;
- const TX_SIZE min_tx_size =
- (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size],
- txsize_horz_map[prev_tx_size])
- : AOMMIN(txsize_vert_map[tx_size],
- txsize_vert_map[prev_tx_size]);
+ TX_SIZE min_tx_size = (dir == VERT_EDGE)
+ ? AOMMIN(txsize_horz_map[tx_size],
+ txsize_horz_map[prev_tx_size])
+ : AOMMIN(txsize_vert_map[tx_size],
+ txsize_vert_map[prev_tx_size]);
+ min_tx_size = AOMMIN(min_tx_size, TX_16X16);
assert(min_tx_size < TX_SIZES);
const int row = r % MI_SIZE_64X64;
const int col = c % MI_SIZE_64X64;
@@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
} else if (plane == 1) {
av1_zero(lfm->left_u);
av1_zero(lfm->above_u);
- av1_zero(lfm->lfl_u_ver);
- av1_zero(lfm->lfl_u_hor);
+ av1_zero(lfm->lfl_u);
} else {
av1_zero(lfm->left_v);
av1_zero(lfm->above_v);
- av1_zero(lfm->lfl_v_ver);
- av1_zero(lfm->lfl_v_hor);
+ av1_zero(lfm->lfl_v);
}
}
}
@@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2(
if ((mask_16x16_0 & mask_16x16_1) & 1) {
if (plane) {
- // TODO(any): add aom_lpf_vertical_6_dual for chroma plane.
- aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
} else {
- // TODO(any): add dual function simd function. Current sse2 code
- // just called aom_lpf_vertical_14_sse2 twice.
aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
lfi1->hev_thr);
@@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2(
if ((mask_8x8_0 & mask_8x8_1) & 1) {
if (plane) {
- aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
- aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
- lfi1->hev_thr);
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
} else {
aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim, lfi1->lim,
@@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2(
if ((mask_16x16_0 & mask_16x16_1) & 1) {
if (plane) {
- aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
} else {
aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim,
@@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2(
if ((mask_8x8_0 & mask_8x8_1) & 1) {
if (plane) {
- aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim,
- lfi0->hev_thr, bd);
- aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim,
- lfi1->lim, lfi1->hev_thr, bd);
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
} else {
aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
lfi0->hev_thr, lfi1->mblim,
@@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
if ((mask_16x16 & two_block_mask) == two_block_mask) {
- /*
- aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr);
- */
-
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
count = 2;
} else {
lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
if ((mask_8x8 & two_block_mask) == two_block_mask) {
- /*
- aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr);
- */
-
- lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr);
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
count = 2;
} else {
lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & two_block_mask) == two_block_mask) {
- /*
aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, lfin->mblim, lfin->lim,
lfin->hev_thr);
- */
- aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
- aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr);
count = 2;
} else {
aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
@@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz(
plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
if ((mask_16x16 & two_block_mask) == two_block_mask) {
- /*
- aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, bd);
- */
-
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
count = 2;
} else {
highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz(
plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
if ((mask_8x8 & two_block_mask) == two_block_mask) {
- /*
- aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
- */
- highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
- bd);
- highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
count = 2;
} else {
highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
@@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz(
}
} else if (mask_4x4 & 1) {
if ((mask_4x4 & two_block_mask) == two_block_mask) {
- /*
aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, lfin->mblim, lfin->lim,
lfin->hev_thr, bd);
- */
- aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, bd);
- aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim,
- lfin->hev_thr, bd);
count = 2;
} else {
aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
@@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz(
}
}
-static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf,
- uint8_t *dst_buf, int ref_stride, int dst_stride,
- int start, int end) {
- return 0;
-
- start <<= MI_SIZE_LOG2;
- end <<= MI_SIZE_LOG2;
- uint8_t *ref0 = ref_buf;
- uint8_t *dst0 = dst_buf;
- if (cm->seq_params.use_highbitdepth) {
- const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf);
- const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf);
- for (int j = 0; j < 4; ++j) {
- for (int i = start; i < end; ++i)
- if (ref16[i] != dst16[i]) {
- ref_buf = ref0;
- dst_buf = dst0;
- return i + 1;
+void av1_build_bitmask_vert_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+ const int mi_row = r << subsampling_y;
+ const int row = mi_row % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(0, row, &index);
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+ c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+ const int mi_col = c << subsampling_x;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int col_in_unit = 0;
+ col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+ const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+ if (x >= plane_ptr->dst.width) break;
+ const int col = col_in_unit << subsampling_x;
+ const uint64_t mask = ((uint64_t)1 << (shift | col));
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_ver[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
}
- ref16 += ref_stride;
- dst16 += dst_stride;
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((c + col_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+ switch (plane) {
+ case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // Record this unit's info; it is the left neighbor of the next one.
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // Advance by the width of the current transform.
+ col_in_unit += tx_size_wide_unit[tx_size];
+ }
}
- } else {
- for (int j = 0; j < 4; ++j) {
- for (int i = start; i < end; ++i)
- if (ref_buf[i] != dst_buf[i]) {
- ref_buf = ref0;
- dst_buf = dst0;
- return i + 1;
+ }
+}
+
+void av1_build_bitmask_horz_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+ const int mi_col = c << subsampling_x;
+ const int col = mi_col % MI_SIZE_64X64;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+ r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+ const int mi_row = r << subsampling_y;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int r_in_unit = 0;
+ r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+ const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+ if (y >= plane_ptr->dst.height) break;
+ const int row = r_in_unit << subsampling_y;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ const uint64_t mask = ((uint64_t)1 << shift);
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_hor[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
}
- ref_buf += ref_stride;
- dst_buf += dst_stride;
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((r + r_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+ switch (plane) {
+ case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // Record this unit's info; it is the above neighbor of the next one.
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // Advance by the height of the current transform.
+ r_in_unit += tx_size_high_unit[tx_size];
+ }
+ }
+ }
+}
+
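+// The two builders above fill the left_*/above_* bitmasks of every 64x64
+// unit from tx sizes, filter levels, and skip flags; the bitmask filter
+// functions below then consume those masks, applying the widest applicable
+// 14-, 8-, 6-, or 4-pixel filter per edge in a single pass.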
+void av1_filter_block_plane_bitmask_vert(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int two_row_step = 2 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ const int two_row_stride = row_stride << 1;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+
+ // 1. Vertical filtering: filter two rows at a time.
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += two_row_step) {
+ const int row = r | ssy;
+ const int row_next = row + row_step;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+ dst->buf += two_row_stride;
+ }
+ // reset buf pointer for horizontal filtering
+ dst->buf = buf0;
+}
+
+void av1_filter_block_plane_bitmask_horz(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += row_step) {
+ if (mi_row + r == 0) {
+ dst->buf += row_stride;
+ continue;
}
+ const int row = r | ssy;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+ dst->buf += row_stride;
}
- ref_buf = ref0;
- dst_buf = dst0;
- return 0;
+ // reset buf pointer for next block
+ dst->buf = buf0;
}
void av1_filter_block_plane_ver(AV1_COMMON *const cm,
@@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
mask_16x16 = lfm->left_u[TX_16X16].bits[index];
mask_8x8 = lfm->left_u[TX_8X8].bits[index];
mask_4x4 = lfm->left_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u_ver[row][col];
- lfl2 = &lfm->lfl_u_ver[row_next][col];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
break;
case 2:
mask_16x16 = lfm->left_v[TX_16X16].bits[index];
mask_8x8 = lfm->left_v[TX_8X8].bits[index];
mask_4x4 = lfm->left_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v_ver[row][col];
- lfl2 = &lfm->lfl_v_ver[row_next][col];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
break;
default: assert(pl >= 0 && pl <= 2); return;
}
@@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
mask_16x16 = lfm->above_u[TX_16X16].bits[index];
mask_8x8 = lfm->above_u[TX_8X8].bits[index];
mask_4x4 = lfm->above_u[TX_4X4].bits[index];
- lfl = &lfm->lfl_u_hor[row][col];
+ lfl = &lfm->lfl_u[row][col];
break;
case 2:
mask_16x16 = lfm->above_v[TX_16X16].bits[index];
mask_8x8 = lfm->above_v[TX_8X8].bits[index];
mask_4x4 = lfm->above_v[TX_4X4].bits[index];
- lfl = &lfm->lfl_v_hor[row][col];
+ lfl = &lfm->lfl_v[row][col];
break;
default: assert(pl >= 0 && pl <= 2); return;
}
@@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
MACROBLOCKD *xd, int start, int stop,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
int plane_start, int plane_end) {
struct macroblockd_plane *pd = xd->plane;
const int col_start = 0;
@@ -1827,6 +2231,45 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
int mi_row, mi_col;
int plane;
+#if LOOP_FILTER_BITMASK
+ if (is_decoding) {
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
+ plane, plane + 1);
+ av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+ av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+
+ // Apply loop filtering, which makes only a single pass over the buffer.
+ for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+ plane, plane + 1);
+ av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
+ mi_col);
+ if (mi_col - MI_SIZE_64X64 >= 0) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
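+ // mi_col has stepped one 64x64 unit past the last column handled above,
+ // so this trailing call filters the horizontal edges of that final
+ // column before moving on to the next superblock row.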
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
+ return;
+ }
+#endif
+
for (plane = plane_start; plane < plane_end; plane++) {
if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
break;
@@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
}
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
- MACROBLOCKD *xd, int plane_start, int plane_end,
- int partial_frame) {
+ MACROBLOCKD *xd,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
+ int plane_start, int plane_end, int partial_frame) {
int start_mi_row, end_mi_row, mi_rows_to_filter;
start_mi_row = 0;
@@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
end_mi_row = start_mi_row + mi_rows_to_filter;
av1_loop_filter_frame_init(cm, plane_start, plane_end);
- loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
- plane_end);
+ loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
+#if LOOP_FILTER_BITMASK
+ is_decoding,
+#endif
+ plane_start, plane_end);
}
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
index c35c3b2dc..80ac61178 100644
--- a/third_party/aom/av1/common/av1_loopfilter.h
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_LOOPFILTER_H_
-#define AV1_COMMON_LOOPFILTER_H_
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
#include "config/aom_config.h"
@@ -60,51 +60,20 @@ typedef struct {
uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
- // U plane vertical edge and horizontal edge filter level
- uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
- uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+ // U plane filter level
+ uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
- // V plane vertical edge and horizontal edge filter level
- uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
- uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-} LoopFilterMask;
+ // V plane filter level
+ uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
-// To determine whether to apply loop filtering at one transform block edge,
-// we need information of the neighboring transform block. Specifically,
-// in determining a vertical edge, we need the information of the tx block
-// to its left. For a horizontal edge, we need info of the tx block above it.
-// Thus, we need to record info of right column and bottom row of tx blocks.
-// We record the information of the neighboring superblock, when bitmask
-// building for a superblock is finished. And it will be used for next
-// superblock bitmask building.
-// Information includes:
-// ------------------------------------------------------------
-// MI_SIZE_64X64
-// Y tx_size above |--------------|
-// Y tx_size left |--------------|
-// UV tx_size above |--------------|
-// UV tx_size left |--------------|
-// Y level above |--------------|
-// Y level left |--------------|
-// U level above |--------------|
-// U level left |--------------|
-// V level above |--------------|
-// V level left |--------------|
-// skip |--------------|
-// ------------------------------------------------------------
-typedef struct {
- TX_SIZE tx_size_y_above[MI_SIZE_64X64];
- TX_SIZE tx_size_y_left[MI_SIZE_64X64];
- TX_SIZE tx_size_uv_above[MI_SIZE_64X64];
- TX_SIZE tx_size_uv_left[MI_SIZE_64X64];
- uint8_t y_level_above[MI_SIZE_64X64];
- uint8_t y_level_left[MI_SIZE_64X64];
- uint8_t u_level_above[MI_SIZE_64X64];
- uint8_t u_level_left[MI_SIZE_64X64];
- uint8_t v_level_above[MI_SIZE_64X64];
- uint8_t v_level_left[MI_SIZE_64X64];
- uint8_t skip[MI_SIZE_64X64];
-} LpfSuperblockInfo;
+ // Per-superblock skip flags, coding-block borders, and tx sizes,
+ // stored as bitmasks.
+ FilterMask skip;
+ FilterMask is_vert_border;
+ FilterMask is_horz_border;
+ // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+ FilterMask tx_size_ver[2][5];
+ FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
#endif // LOOP_FILTER_BITMASK
struct loopfilter {
@@ -130,7 +99,6 @@ struct loopfilter {
LoopFilterMask *lfm;
size_t lfm_num;
int lfm_stride;
- LpfSuperblockInfo neighbor_sb_lpf_info;
#endif // LOOP_FILTER_BITMASK
};
@@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm);
void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
int plane_end);
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int is_decoding,
+ int plane_start, int plane_end, int partial_frame);
+#else
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
struct macroblockd *mbd, int plane_start,
int plane_end, int partial_frame);
+#endif
void av1_filter_block_plane_vert(const struct AV1Common *const cm,
const MACROBLOCKD *const xd, const int plane,
@@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData {
MACROBLOCKD *xd;
} LFWorkerData;
+uint8_t get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
#if LOOP_FILTER_BITMASK
void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
int plane, int subsampling_x, int subsampling_y,
@@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm,
void av1_filter_block_plane_hor(struct AV1Common *const cm,
struct macroblockd_plane *const plane, int pl,
int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+ int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {
+ { { 0x0000000000000001ULL, // TX_4X4,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0000000000010001ULL, // TX_8X8,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_16X16,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_32X32,
+ 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_64X64,
+ 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000003ULL, // TX_8X8
+ 0x000000000000000fULL, // TX_16X16
+ 0x00000000000000ffULL, // TX_32X32
+ 0x000000000000ffffULL, // TX_64X64
+ },
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000005ULL, // TX_8X8
+ 0x0000000000000055ULL, // TX_16X16
+ 0x0000000000005555ULL, // TX_32X32
+ 0x0000000055555555ULL, // TX_64X64
+ },
+};
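+// The first index appears to select the horizontally subsampled variant
+// (an assumption): row 0 packs contiguous bits for full-resolution mi
+// columns, while row 1 sets every other bit, matching 4:2:0 chroma where
+// one chroma 4x4 unit spans two luma mi columns.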
+
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
#endif
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_LOOPFILTER_H_
+#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index fa8b34981..dee1f1c79 100755
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
+
# directional intra predictor functions
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
-
# FILTER_INTRA predictor functions
add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
specialize qw/av1_filter_intra_predictor sse4_1/;
@@ -108,6 +108,22 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
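+# Each add_proto/specialize pair above makes the rtcd generator emit a
+# runtime-dispatched entry point: av1_highbd_inv_txfm_add, for example,
+# resolves to its sse4_1 or avx2 variant when the host CPU supports that
+# extension and falls back to the plain C implementation otherwise.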
add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_32x32 avx2/;
add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou
add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/;
-
add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride
# build compound seg mask functions
add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
#
# Encoder functions below this point.
@@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/av1_temporal_filter_apply sse2 msa/;
- add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
# ENCODEMB INVOKE
@@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
specialize qw/av1_get_nz_map_contexts sse2/;
add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
- specialize qw/av1_txb_init_levels sse4_1/;
+ specialize qw/av1_txb_init_levels sse4_1 avx2/;
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
@@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
specialize qw/av1_get_crc32c_value sse4_2/;
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ specialize qw/av1_compute_stats sse4_1 avx2/;
+
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
}
# end encoder functions
@@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
# WARPED_MOTION / GLOBAL_MOTION functions
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1/;
+specialize qw/av1_warp_affine sse4_1 neon/;
add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_highbd_warp_affine sse4_1/;
@@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
-add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
- int sgr_params_idx, int bit_depth, int highbd";
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd";
specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
# CONVOLVE_ROUND/COMPOUND_ROUND functions
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
index 1e6654121..bb70eab70 100644
--- a/third_party/aom/av1/common/av1_txfm.c
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -108,3 +108,53 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
1, // TXFM_TYPE_IDENTITY16
1, // TXFM_TYPE_IDENTITY32
};
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ const int64_t max_value = (1LL << (bit - 1)) - 1;
+ const int64_t min_value = -(1LL << (bit - 1));
+
+ int in_range = 1;
+
+ for (int i = 0; i < size; ++i) {
+ if (buf[i] < min_value || buf[i] > max_value) {
+ in_range = 0;
+ }
+ }
+
+ if (!in_range) {
+ fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+ fprintf(stderr, "size: %d\n", size);
+ fprintf(stderr, "stage: %d\n", stage);
+ fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+ max_value);
+
+ fprintf(stderr, "coeffs: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", input[j]);
+ }
+ fprintf(stderr, "]\n");
+
+ fprintf(stderr, " buf: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", buf[j]);
+ }
+ fprintf(stderr, "]\n\n");
+ }
+
+ assert(in_range);
+#else
+ (void)stage;
+ (void)input;
+ (void)buf;
+ (void)size;
+ (void)bit;
+#endif
+}
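
For context, a minimal sketch of where this check would slot into a 1-D inverse transform (the skeleton, helper names, and stage_range plumbing below are illustrative assumptions, not part of this patch; av1_range_check_buf() itself is declared in av1_txfm.h):

#include <stdint.h>

// Hypothetical 1-D transform skeleton: after each butterfly stage, every
// coefficient must still fit within stage_range[stage] bits. The check is
// a no-op unless CONFIG_COEFFICIENT_RANGE_CHECKING is enabled.
void inv_txfm1d_sketch(const int32_t *input, int32_t *buf, int32_t size,
                       const int8_t *stage_range, int32_t num_stages) {
  for (int32_t i = 0; i < size; ++i) buf[i] = input[i];
  for (int32_t stage = 0; stage < num_stages; ++stage) {
    // ... butterfly computation updating buf in place ...
    av1_range_check_buf(stage, input, buf, size, stage_range[stage]);
  }
}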
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
index c9cc79852..59d64ca4a 100644
--- a/third_party/aom/av1/common/av1_txfm.h
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXFM_H_
-#define AV1_TXFM_H_
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
#include <assert.h>
#include <math.h>
@@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5];
static const int cos_bit_min = 10;
static const int cos_bit_max = 16;
-static const int NewSqrt2Bits = 12;
+#define NewSqrt2Bits ((int32_t)12)
// 2^12 * sqrt(2)
static const int32_t NewSqrt2 = 5793;
// 2^12 / sqrt(2)
@@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
#if DO_RANGE_CHECK_CLAMP
bit = AOMMIN(bit, 31);
- return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1)));
+ return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
#endif // DO_RANGE_CHECK_CLAMP
(void)bit;
return value;
@@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) {
static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
int bit) {
int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+ int64_t intermediate = result_64 + (1LL << (bit - 1));
+ // NOTE(david.barker): The value 'result_64' may not necessarily fit
+ // into 32 bits. However, the result of this function is nominally
+ // ROUND_POWER_OF_TWO_64(result_64, bit)
+ // and that is required to fit into stage_range[stage] many bits
+ // (checked by range_check_buf()).
+ //
+ // Here we've unpacked that rounding operation, and it can be shown
+ // that the value of 'intermediate' here *does* fit into 32 bits
+ // for any conformant bitstream.
+ // The upshot is that, if you do all this calculation using
+ // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+ // then you'll still get the correct result.
+ // To provide a check on this logic, we assert that 'intermediate'
+ // would fit into an int32 if range checking is enabled.
#if CONFIG_COEFFICIENT_RANGE_CHECKING
- assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX);
+ assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
#endif
- return round_shift(result_64, bit);
+ return (int32_t)(intermediate >> bit);
}
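
To make the note above concrete, here is a small standalone sketch (not part of the patch) checking that the wrapped 32-bit evaluation matches the 64-bit one whenever the rounded result fits in 32 bits:

#include <assert.h>
#include <stdint.h>

// 64-bit reference: w0*in0 + w1*in1, rounded and shifted down by 'bit'.
static int32_t half_btf_ref(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
                            int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + (1LL << (bit - 1))) >> bit);
}

// Wrapping 32-bit evaluation (unsigned so the wrap is well defined in C).
static int32_t half_btf_wrap32(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const uint32_t sum =
      (uint32_t)w0 * (uint32_t)in0 + (uint32_t)w1 * (uint32_t)in1;
  // The cast back to int32_t assumes two's-complement wrap-around, which
  // holds on every platform libaom targets.
  return (int32_t)(sum + (1u << (bit - 1))) >> bit;
}

int main(void) {
  // NewSqrt2-style weight with cos_bit = 12; the result fits in 32 bits,
  // so both evaluations agree even though the 32-bit sum may have wrapped.
  assert(half_btf_ref(5793, 30000, -2048, 12345, 12) ==
         half_btf_wrap32(5793, 30000, -2048, 12345, 12));
  return 0;
}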
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
@@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) {
static INLINE int get_txh_idx(TX_SIZE tx_size) {
return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
}
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit);
#define MAX_TXWH_IDX 5
#ifdef __cplusplus
}
#endif // __cplusplus
-#endif // AV1_TXFM_H_
+#endif // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
index 86b4b5d6c..2e796b656 100644
--- a/third_party/aom/av1/common/blockd.c
+++ b/third_party/aom/av1/common/blockd.c
@@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
return above_mi->mode;
}
-void av1_foreach_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
- foreach_transformed_block_visitor visit, void *arg) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
- // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
- // transform size varies per plane, look it up in a common way.
- const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const uint8_t txw_unit = tx_size_wide_unit[tx_size];
- const uint8_t txh_unit = tx_size_high_unit[tx_size];
- const int step = txw_unit * txh_unit;
- int i = 0, r, c;
-
- // If mb_to_right_edge is < 0 we are in a situation in which
- // the current block size extends into the UMV and we won't
- // visit the sub blocks that are wholly within the UMV.
- const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
- const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
-
- int blk_row, blk_col;
-
- const BLOCK_SIZE max_unit_bsize =
- get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
- int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
- int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
- mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
- mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
-
- // Keep track of the row and column of the blocks we use so that we know
- // if we are in the unrestricted motion border.
- for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
- const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
- // Skip visiting the sub blocks that are wholly within the UMV.
- for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
- const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
- for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
- for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
- visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
- i += step;
- }
- }
- }
- }
-}
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- foreach_transformed_block_visitor visit,
- void *arg, const int num_planes) {
- for (int plane = 0; plane < num_planes; ++plane) {
- if (!is_chroma_reference(mi_row, mi_col, bsize,
- xd->plane[plane].subsampling_x,
- xd->plane[plane].subsampling_y))
- continue;
- av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
- }
-}
-
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
@@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
xd->plane[i].subsampling_x = i ? ss_x : 0;
xd->plane[i].subsampling_y = i ? ss_y : 0;
}
+ for (i = num_planes; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].subsampling_x = 1;
+ xd->plane[i].subsampling_y = 1;
+ }
}
const int16_t dr_intra_derivative[90] = {
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
index 979f13bd9..a2311c1b0 100644
--- a/third_party/aom/av1/common/blockd.h
+++ b/third_party/aom/av1/common/blockd.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_BLOCKD_H_
-#define AV1_COMMON_BLOCKD_H_
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
#include "config/aom_config.h"
@@ -38,13 +38,13 @@ extern "C" {
#define MAX_DIFFWTD_MASK_BITS 1
// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
DIFFWTD_38 = 0,
DIFFWTD_38_INV,
DIFFWTD_MASK_TYPES,
} DIFFWTD_MASK_TYPE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
KEY_FRAME = 0,
INTER_FRAME = 1,
INTRA_ONLY_FRAME = 2, // replaces intra-only
@@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
}
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
- return mode >= NEARESTMV && mode <= NEW_NEWMV;
+ return mode >= INTER_MODE_START && mode < INTER_MODE_END;
}
typedef struct {
@@ -66,10 +66,10 @@ typedef struct {
} BUFFER_SET;
static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
- return mode >= NEARESTMV && mode <= NEWMV;
+ return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
}
static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
- return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+ return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
}
static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
@@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
}
-static INLINE int use_masked_motion_search(COMPOUND_TYPE type) {
- return (type == COMPOUND_WEDGE);
-}
-
static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
}
@@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO {
int mi_row;
int mi_col;
#endif
- int num_proj_ref[2];
- WarpedMotionParams wm_params[2];
+ int num_proj_ref;
+ WarpedMotionParams wm_params;
// Index of the alpha Cb and alpha Cr combination
int cfl_alpha_idx;
@@ -376,7 +372,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
}
#endif
-enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
struct buf_2d {
uint8_t *buf;
@@ -500,6 +496,8 @@ typedef struct jnt_comp_params {
int bck_offset;
} JNT_COMP_PARAMS;
+// Most/all of the pointers are mere pointers to actual arrays that are
+// allocated elsewhere. This is mostly for coding convenience.
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
@@ -544,7 +542,7 @@ typedef struct macroblockd {
SgrprojInfo sgrproj_info[MAX_MB_PLANE];
// block dimension in the unit of mode_info.
- uint8_t n8_w, n8_h;
+ uint8_t n4_w, n4_h;
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
@@ -599,6 +597,9 @@ typedef struct macroblockd {
uint16_t cb_offset[MAX_MB_PLANE];
uint16_t txb_offset[MAX_MB_PLANE];
uint16_t color_index_map_offset[2];
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
} MACROBLOCKD;
static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
@@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
}
}
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
// Note: the input block size should be square.
// Otherwise it's considered invalid.
static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
@@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
return intra_mode_to_tx_type(mbmi, plane_type);
}
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
int subsampling_x,
int subsampling_y) {
@@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg);
-void av1_foreach_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
- foreach_transformed_block_visitor visit, void *arg);
-
-void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- foreach_transformed_block_visitor visit,
- void *arg, const int num_planes);
-
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff);
@@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
}
static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
- return (mode >= NEARESTMV) && (mode <= NEWMV);
+ return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
}
static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
@@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
is_motion_variation_allowed_compound(mbmi)) {
if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
assert(!has_second_ref(mbmi));
- if (mbmi->num_proj_ref[0] >= 1 &&
+ if (mbmi->num_proj_ref >= 1 &&
(allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
if (xd->cur_frame_force_integer_mv) {
return OBMC_CAUSAL;
@@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
} // extern "C"
#endif
-#endif // AV1_COMMON_BLOCKD_H_
+#endif // AOM_AV1_COMMON_BLOCKD_H_
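
A quick illustration of the comment added to get_partition_subsize() above — spot checks against the spec's Partition_Subsize table (a sketch, not part of the patch):

#include <assert.h>
#include "av1/common/blockd.h"  // for get_partition_subsize()

int main(void) {
  assert(get_partition_subsize(BLOCK_64X64, PARTITION_SPLIT) == BLOCK_32X32);
  assert(get_partition_subsize(BLOCK_64X64, PARTITION_HORZ) == BLOCK_64X32);
  // PARTITION_HORZ_4 yields four equal 64x16 strips, so the (largest)
  // sub-block size returned is BLOCK_64X16.
  assert(get_partition_subsize(BLOCK_64X64, PARTITION_HORZ_4) == BLOCK_64X16);
  return 0;
}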
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
index 092230de9..3b2eac8a5 100644
--- a/third_party/aom/av1/common/cdef.h
+++ b/third_party/aom/av1/common/cdef.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_CDEF_H_
-#define AV1_COMMON_CDEF_H_
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
#define CDEF_STRENGTH_BITS 6
@@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_CDEF_H_
+#endif // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
index 81c6da077..6b4452cd6 100644
--- a/third_party/aom/av1/common/cdef_block.h
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#if !defined(_CDEF_BLOCK_H)
-#define _CDEF_BLOCK_H (1)
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
#include "av1/common/odintrin.h"
@@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int pri_damping, int sec_damping,
int coeff_shift);
-#endif
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
index d24a7c0fa..14587a023 100644
--- a/third_party/aom/av1/common/cdef_block_simd.h
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
#include "config/av1_rtcd.h"
#include "av1/common/cdef_block.h"
@@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
}
}
}
+
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
index bc9fbce1b..d627891bf 100644
--- a/third_party/aom/av1/common/cfl.h
+++ b/third_party/aom/av1/common/cfl.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_CFL_H_
-#define AV1_COMMON_CFL_H_
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
@@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
return pred[tx_size % TX_SIZES_ALL]; \
}
-#endif // AV1_COMMON_CFL_H_
+#endif // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
index 72c6d3a1e..bed6083db 100644
--- a/third_party/aom/av1/common/common.h
+++ b/third_party/aom/av1/common/common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_COMMON_H_
-#define AV1_COMMON_COMMON_H_
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
/* Interface header for common constant data structures and lookup tables */
@@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
} // extern "C"
#endif
-#endif // AV1_COMMON_COMMON_H_
+#endif // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
index f521f10bf..46e455fdb 100644
--- a/third_party/aom/av1/common/common_data.h
+++ b/third_party/aom/av1/common/common_data.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_COMMON_DATA_H_
-#define AV1_COMMON_COMMON_DATA_H_
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
#include "av1/common/enums.h"
#include "aom/aom_integer.h"
@@ -20,34 +20,43 @@
extern "C" {
#endif
-// Log 2 conversion lookup tables in units of mode info(4x4).
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
};
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
};
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
};
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
};
-// Width/height lookup tables in units of various block sizes
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32,
64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64
};
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64,
32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16
};
-// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize)))
static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
};
@@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
};
+// A compressed version of the Partition_Subsize table in the spec (9.3.
+// Conversion tables), for square block sizes only.
/* clang-format off */
static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
{ // PARTITION_NONE
@@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_64X64, // TX_MODE_LARGEST
TX_64X64, // TX_MODE_SELECT
};
-/* clang-format on */
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
- // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
- // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
- { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
- { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
- { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } },
- { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } },
- { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
- { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } },
- { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } },
- { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
- { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } },
- { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } },
- { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
- { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
- { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
- { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
- { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } },
- { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } },
- { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
- { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
- { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
- { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
};
+/* clang-format on */
// Generates a 5-bit field in which each bit set to 1 represents
// a block-size partition: 11111 means we split 128x128, 64x64, 32x32, 16x16
@@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = {
} // extern "C"
#endif
-#endif // AV1_COMMON_COMMON_DATA_H_
+#endif // AOM_AV1_COMMON_COMMON_DATA_H_
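
To illustrate the corrected Subsampled_Size entries above (a sketch, not part of the patch): for 4:2:0 content both subsampling flags are 1, and any size whose subsampled dimension would drop below 4 samples maps to BLOCK_INVALID:

#include <assert.h>
#include "av1/common/common_data.h"  // for ss_size_lookup and BLOCK_* enums

int main(void) {
  // get_plane_block_size() reduces to ss_size_lookup[bsize][ss_x][ss_y].
  assert(ss_size_lookup[BLOCK_16X16][1][1] == BLOCK_8X8);    // 4:2:0 chroma
  assert(ss_size_lookup[BLOCK_16X16][1][0] == BLOCK_8X16);   // 4:2:2 chroma
  assert(ss_size_lookup[BLOCK_4X8][1][0] == BLOCK_INVALID);  // no 2x8 block
  return 0;
}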
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index ed962c722..1f11126fc 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
int32_t res = 0;
@@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper(
y_step_qn, conv_params);
}
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
+// we may create optimized code to do 2-tap filtering for all bilinear filtering
+// usages, not just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ int subpel_x_q4, int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0, conv_params);
+ } else if (subpel_x_q4 != 0) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ } else {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ }
+}
+
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
InterpFilters interp_filters, const int subpel_x_q4,
int x_step_q4, const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf) {
+ const struct scale_factors *sf, int is_intrabc) {
+ assert(IMPLIES(is_intrabc, !scaled));
(void)x_step_q4;
(void)y_step_q4;
(void)dst;
(void)dst_stride;
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+ subpel_y_q4, conv_params);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
const InterpFilterParams *filter_params_x =
- av1_get_interp_filter_params_with_block_size(filter_x, w);
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
const InterpFilterParams *filter_params_y =
- av1_get_interp_filter_params_with_block_size(filter_y, h);
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
- if (scaled)
+ if (scaled) {
convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
filter_params_x, filter_params_y, subpel_x_q4,
x_step_q4, subpel_y_q4, y_step_q4, conv_params);
- else
+ } else {
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, filter_params_x,
filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ }
}
void av1_highbd_convolve_2d_copy_sr_c(
@@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
}
}
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, int subpel_x_q4,
+ int subpel_y_q4,
+ ConvolveParams *conv_params,
+ int bd) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else if (subpel_x_q4 != 0) {
+ av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else {
+ av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ }
+}
+
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst8, int dst_stride, int w, int h,
InterpFilters interp_filters,
const int subpel_x_q4, int x_step_q4,
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int bd) {
+ const struct scale_factors *sf,
+ int is_intrabc, int bd) {
+ assert(IMPLIES(is_intrabc, !scaled));
(void)x_step_q4;
(void)y_step_q4;
(void)dst_stride;
-
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
- InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+ subpel_x_q4, subpel_y_q4, conv_params, bd);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
const InterpFilterParams *filter_params_x =
- av1_get_interp_filter_params_with_block_size(filter_x, w);
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
const InterpFilterParams *filter_params_y =
- av1_get_interp_filter_params_with_block_size(filter_y, h);
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
if (scaled) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
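
The facade changes above only fetch interpolation filter parameters when they are actually consumed: if a subpel offset is zero (and no scaling is in effect) that dimension degenerates to a copy, and the sf->convolve[][][] dispatch selects a kernel that never dereferences the NULL params. A reduced sketch of that dispatch idea, with hypothetical names and trivial stand-in kernels (not libaom code):

#include <stdint.h>
#include <string.h>

typedef void (*conv_fn)(const uint8_t *src, uint8_t *dst, int w, int h);

static void copy_block(const uint8_t *s, uint8_t *d, int w, int h) {
  memcpy(d, s, (size_t)w * h);  // no subpel offset in either dimension
}
static void filt_x(const uint8_t *s, uint8_t *d, int w, int h) {
  copy_block(s, d, w, h);  // placeholder for horizontal-only filtering
}
static void filt_y(const uint8_t *s, uint8_t *d, int w, int h) {
  copy_block(s, d, w, h);  // placeholder for vertical-only filtering
}
static void filt_2d(const uint8_t *s, uint8_t *d, int w, int h) {
  copy_block(s, d, w, h);  // placeholder for separable 2-D filtering
}

// Indexed by [subpel_x != 0][subpel_y != 0], mirroring sf->convolve[][][].
static const conv_fn dispatch[2][2] = {
  { copy_block, filt_y },
  { filt_x, filt_2d },
};

static void convolve_sketch(const uint8_t *s, uint8_t *d, int w, int h,
                            int subpel_x_q4, int subpel_y_q4) {
  // Only the filtering paths need filter params; the copy path can safely
  // be handed NULL params, as in av1_convolve_2d_facade() above.
  dispatch[subpel_x_q4 != 0][subpel_y_q4 != 0](s, d, w, h);
}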
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
index bc2d4bccf..4109dd843 100644
--- a/third_party/aom/av1/common/convolve.h
+++ b/third_party/aom/av1/common/convolve.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_AV1_CONVOLVE_H_
-#define AV1_COMMON_AV1_CONVOLVE_H_
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
#include "av1/common/filter.h"
#ifdef __cplusplus
@@ -19,7 +19,6 @@ extern "C" {
typedef uint16_t CONV_BUF_TYPE;
typedef struct ConvolveParams {
- int ref;
int do_average;
CONV_BUF_TYPE *dst;
int dst_stride;
@@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
InterpFilters interp_filters, const int subpel_x_q4,
int x_step_q4, const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf);
+ const struct scale_factors *sf, int is_intrabc);
-static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
- int plane,
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
CONV_BUF_TYPE *dst,
int dst_stride,
int is_compound, int bd) {
ConvolveParams conv_params;
- conv_params.ref = ref;
conv_params.do_average = do_average;
assert(IMPLIES(do_average, is_compound));
conv_params.is_compound = is_compound;
@@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
return conv_params;
}
-static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane,
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
int bd) {
- return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd);
+ return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
}
static INLINE ConvolveParams get_conv_params_wiener(int bd) {
ConvolveParams conv_params;
(void)bd;
- conv_params.ref = 0;
conv_params.do_average = 0;
conv_params.is_compound = 0;
conv_params.round_0 = WIENER_ROUND0_BITS;
@@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
const int subpel_x_q4, int x_step_q4,
const int subpel_y_q4, int y_step_q4,
int scaled, ConvolveParams *conv_params,
- const struct scale_factors *sf, int bd);
+ const struct scale_factors *sf,
+ int is_intrabc, int bd);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_AV1_CONVOLVE_H_
+#endif // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
index ef944c5a0..991692c2f 100644
--- a/third_party/aom/av1/common/entropy.h
+++ b/third_party/aom/av1/common/entropy.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPY_H_
-#define AV1_COMMON_ENTROPY_H_
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
#include "config/aom_config.h"
@@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPY_H_
+#endif // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
index 0bd2e20a1..7047f34d2 100644
--- a/third_party/aom/av1/common/entropymode.h
+++ b/third_party/aom/av1/common/entropymode.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPYMODE_H_
-#define AV1_COMMON_ENTROPYMODE_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
#include "av1/common/entropy.h"
#include "av1/common/entropymv.h"
@@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas);
void av1_setup_frame_contexts(struct AV1Common *cm);
void av1_setup_past_independence(struct AV1Common *cm);
+// Returns (int)ceil(log2(n)).
+// NOTE: This implementation only works for n <= 2^30.
static INLINE int av1_ceil_log2(int n) {
if (n < 2) return 0;
int i = 1, p = 2;
@@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPYMODE_H_
+#endif // AOM_AV1_COMMON_ENTROPYMODE_H_
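
A few spot checks of av1_ceil_log2() as documented above. The local mirror below assumes the loop continues as in the libaom inline (a standalone sketch, not part of the patch):

#include <assert.h>

static int ceil_log2(int n) {
  if (n < 2) return 0;
  int i = 1, p = 2;
  while (p < n) {
    i++;
    p = p << 1;
  }
  return i;
}

int main(void) {
  assert(ceil_log2(1) == 0);         // n < 2 returns 0
  assert(ceil_log2(2) == 1);
  assert(ceil_log2(3) == 2);         // rounds up for non-powers of two
  assert(ceil_log2(1 << 30) == 30);  // largest safe input per the NOTE
  return 0;
}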
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
index 446aa433c..491337387 100644
--- a/third_party/aom/av1/common/entropymv.c
+++ b/third_party/aom/av1/common/entropymv.c
@@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = {
} },
};
-static const uint8_t log_in_base_2[] = {
- 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
-};
-
-static INLINE int mv_class_base(MV_CLASS_TYPE c) {
- return c ? CLASS0_SIZE << (c + 2) : 0;
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
- const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
- ? MV_CLASS_10
- : (MV_CLASS_TYPE)log_in_base_2[z >> 3];
- if (offset) *offset = z - mv_class_base(c);
- return c;
-}
-
void av1_init_mv_probs(AV1_COMMON *cm) {
// NB: this sets CDFs too
cm->fc->nmvc = default_nmv_context;
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
index 02ca7b66b..fa818a2c1 100644
--- a/third_party/aom/av1/common/entropymv.h
+++ b/third_party/aom/av1/common/entropymv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENTROPYMV_H_
-#define AV1_COMMON_ENTROPYMV_H_
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
#include "config/aom_config.h"
@@ -91,16 +91,6 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
-static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
- if (mv->row == 0) {
- return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
- } else {
- return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
- }
-}
-
-MV_CLASS_TYPE av1_get_mv_class(int z, int *offset);
-
typedef enum {
MV_SUBPEL_NONE = -1,
MV_SUBPEL_LOW_PRECISION = 0,
@@ -111,4 +101,4 @@ typedef enum {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENTROPYMV_H_
+#endif // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
index 689c25f30..869c06ef2 100644
--- a/third_party/aom/av1/common/enums.h
+++ b/third_party/aom/av1/common/enums.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ENUMS_H_
-#define AV1_COMMON_ENUMS_H_
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
#include "config/aom_config.h"
@@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED {
TX_TYPES,
} TX_TYPE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
REG_REG,
REG_SMOOTH,
REG_SHARP,
@@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED {
COMP_INTER_MODE_START = NEAREST_NEARESTMV,
COMP_INTER_MODE_END = MB_MODE_COUNT,
COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+ INTER_MODE_START = NEARESTMV,
+ INTER_MODE_END = MB_MODE_COUNT,
INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
} PREDICTION_MODE;
@@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED {
INTERINTRA_MODES
} INTERINTRA_MODE;
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
COMPOUND_AVERAGE,
COMPOUND_WEDGE,
COMPOUND_DIFFWTD,
@@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED {
} // extern "C"
#endif
-#endif // AV1_COMMON_ENUMS_H_
+#endif // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
index 7f8ad583a..571422d11 100644
--- a/third_party/aom/av1/common/filter.h
+++ b/third_party/aom/av1/common/filter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_FILTER_H_
-#define AV1_COMMON_FILTER_H_
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
#include <assert.h>
@@ -139,6 +139,17 @@ static const InterpFilterParams
BILINEAR }
};
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full-pel
+// MVs for luma. If chroma sub-sampling exists, chroma may use half-pel MVs.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+ 64,
+ 64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+ av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
DECLARE_ALIGNED(256, static const InterpKernel,
av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
@@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
return &av1_interp_filter_params_list[interp_filter];
}
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+ const InterpFilter interp_filter) {
+ return &av1_interp_4tap[interp_filter];
+}
+
static INLINE const int16_t *av1_get_interp_filter_kernel(
const InterpFilter interp_filter) {
return av1_interp_filter_params_list[interp_filter].filter_ptr;
@@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
} // extern "C"
#endif
-#endif // AV1_COMMON_FILTER_H_
+#endif // AOM_AV1_COMMON_FILTER_H_
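
To see what the {64, 64} kernel above computes: with FILTER_BITS == 7, the half-pel sample is just the rounded average of its two neighbors. A minimal sketch (not part of the patch):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

// 2-tap bilinear filtering at the half-pel position: (64*a + 64*b) with
// rounding, shifted down by FILTER_BITS, i.e. the rounded average.
static uint8_t bilinear_halfpel(uint8_t a, uint8_t b) {
  const int sum = 64 * a + 64 * b + (1 << (FILTER_BITS - 1));
  return (uint8_t)(sum >> FILTER_BITS);
}

int main(void) {
  assert(bilinear_halfpel(10, 20) == 15);
  assert(bilinear_halfpel(10, 11) == 11);  // 10.5 rounds up to 11
  return 0;
}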
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
index 502ccd27d..fd6c4bc79 100644
--- a/third_party/aom/av1/common/frame_buffers.c
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
list->int_fb = NULL;
}
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+ int i;
+
+ assert(list != NULL);
+
+ for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+ if (list->int_fb[i].data && !list->int_fb[i].in_use)
+ memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+ }
+}
+
int av1_get_frame_buffer(void *cb_priv, size_t min_size,
aom_codec_frame_buffer_t *fb) {
int i;
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
index e7341cfdd..16188e51c 100644
--- a/third_party/aom/av1/common/frame_buffers.h
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_FRAME_BUFFERS_H_
-#define AV1_COMMON_FRAME_BUFFERS_H_
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
#include "aom/aom_frame_buffer.h"
#include "aom/aom_integer.h"
@@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
// Free any data allocated to the frame buffers.
void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the new width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
// Callback used by libaom to request an external frame buffer. |cb_priv|
// Callback private data, which points to an InternalFrameBufferList.
// |min_size| is the minimum size in bytes needed to decode the next frame.
@@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
} // extern "C"
#endif
-#endif // AV1_COMMON_FRAME_BUFFERS_H_
+#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_
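
A hypothetical usage sketch for the new helper (the hook name and the sequence-header comparison are illustrative assumptions, not libaom API):

// 'list' would be the decoder's InternalFrameBufferList and
// 'seq_params_changed' the result of comparing the new sequence header
// against the previous one. Unused buffers may still hold borders sized
// for the old width/height/bit depth, so wipe them before reuse.
static void on_new_sequence_header(InternalFrameBufferList *list,
                                   int seq_params_changed) {
  if (seq_params_changed) av1_zero_unused_internal_frame_buffers(list);
}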
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
index bc758eb57..2c1cb9827 100644
--- a/third_party/aom/av1/common/idct.c
+++ b/third_party/aom/av1/common/idct.c
@@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) {
// that input and output could be the same buffer.
// idct
-static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest,
- int stride, int eob, int bd) {
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
if (eob > 1)
av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
else
av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
-static const int32_t *cast_to_int32(const tran_low_t *input) {
- assert(sizeof(int32_t) == sizeof(tran_low_t));
- return (const int32_t *)input;
-}
-
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
int eob = txfm_param->eob;
int bd = txfm_param->bd;
@@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
const TX_TYPE tx_type = txfm_param->tx_type;
if (lossless) {
assert(tx_type == DCT_DCT);
- highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
return;
}
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- }
+
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
}
-static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
-}
-static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- const int32_t *src = cast_to_int32(input);
- av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
}
-static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
bd);
- break;
- }
}
-static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default:
- av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- }
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
}
-static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default: assert(0);
- }
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
-static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
const int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
assert(tx_type == DCT_DCT);
- av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+ av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
@@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
}
-static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
case TX_32X32:
- highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
break;
case TX_16X16:
- highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
break;
case TX_8X8:
- highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
break;
case TX_4X8:
- highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
break;
case TX_8X4:
- highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
break;
case TX_8X16:
- highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
break;
case TX_16X8:
- highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
break;
case TX_16X32:
- highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
break;
case TX_32X16:
- highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
break;
case TX_64X64:
- highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
break;
case TX_32X64:
- highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
break;
case TX_64X32:
- highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
break;
case TX_16X64:
- highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
break;
case TX_64X16:
- highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
break;
case TX_4X4:
// this is like av1_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
break;
case TX_16X4:
- highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
break;
case TX_4X16:
- highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
break;
case TX_8X32:
- highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
break;
case TX_32X8:
- highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
break;
default: assert(0 && "Invalid transform size"); break;
}
@@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
}
}
- highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+ txfm_param);
for (int r = 0; r < h; ++r) {
for (int c = 0; c < w; ++c) {
@@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
if (txfm_param.is_hbd) {
- highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
} else {
av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
}
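
The idct.c hunks above all apply one pattern: a static wrapper is renamed to an exported av1_highbd_inv_txfm_add_*_c function that casts the coefficient buffer to int32_t and hands the aliased uint16_t destination to the matching av1_inv_txfm2d_add_*_c core. A minimal standalone sketch of that pattern follows; the trimmed types and the 4x4 core are hypothetical stand-ins, and only the pointer-aliasing macro mirrors libaom's CONVERT_TO_SHORTPTR.

    /* Sketch of the wrapper pattern behind av1_highbd_inv_txfm_add_*_c.
     * TxfmParam is trimmed and the 4x4 core is made up for illustration. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t tran_low_t;                        /* high-bitdepth coeffs */
    typedef struct { int tx_type; int bd; } TxfmParam; /* trimmed stand-in */

    /* CONVERT_TO_SHORTPTR recovers the uint16_t pointer that
     * CONVERT_TO_BYTEPTR disguised as uint8_t* by shifting the address. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

    static uint16_t clip_pixel_highbd(int val, int bd) {
      const int max = (1 << bd) - 1;
      return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
    }

    /* Hypothetical core: add a 4x4 residual block to the destination. */
    static void inv_txfm2d_add_4x4_core(const int32_t *src, uint16_t *dst,
                                        int stride, int bd) {
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          dst[r * stride + c] =
              clip_pixel_highbd(dst[r * stride + c] + src[r * 4 + c], bd);
    }

    void highbd_inv_txfm_add_4x4_sketch(const tran_low_t *input, uint8_t *dest,
                                        int stride, const TxfmParam *txfm_param) {
      assert(sizeof(int32_t) == sizeof(tran_low_t)); /* cast_to_int32() */
      const int32_t *src = (const int32_t *)input;
      inv_txfm2d_add_4x4_core(src, CONVERT_TO_SHORTPTR(dest), stride,
                              txfm_param->bd);
    }

    int main(void) {
      uint16_t frame[4 * 4] = { 0 };
      int32_t residual[4 * 4];
      for (int i = 0; i < 16; ++i) residual[i] = 100 * i;
      const TxfmParam p = { 0, 10 }; /* tx_type unused here; 10-bit */
      /* Re-create the byte-pointer alias a caller would normally hold. */
      uint8_t *dest = (uint8_t *)(((uintptr_t)frame) >> 1);
      highbd_inv_txfm_add_4x4_sketch(residual, dest, 4, &p);
      printf("%u %u\n", (unsigned)frame[1], (unsigned)frame[15]); /* 100 1023 */
      return 0;
    }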
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
index 50032a167..d9454e73f 100644
--- a/third_party/aom/av1/common/idct.h
+++ b/third_party/aom/av1/common/idct.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_IDCT_H_
-#define AV1_COMMON_IDCT_H_
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
#include "config/aom_config.h"
@@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
const tran_low_t *dqcoeff, int plane,
TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+ assert(sizeof(int32_t) == sizeof(tran_low_t));
+ return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
-void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *param);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_IDCT_H_
+#endif // AOM_AV1_COMMON_IDCT_H_
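
The typedef introduced above, typedef void(highbd_inv_txfm_add)(...), names a function type rather than a function-pointer type, so each following line such as "highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;" is an ordinary prototype. A small self-contained sketch of the same idiom, with hypothetical names:

    /* Declaring prototypes through a function type, as idct.h now does. */
    #include <stdio.h>

    typedef void(unary_op)(int x); /* a function type, not a pointer type */

    unary_op print_twice;          /* same as: void print_twice(int); */
    unary_op print_thrice;

    void print_twice(int x) { printf("%d %d\n", x, x); }
    void print_thrice(int x) { printf("%d %d %d\n", x, x, x); }

    int main(void) {
      unary_op *table[] = { print_twice, print_thrice }; /* pointers to them */
      table[1](7); /* prints: 7 7 7 */
      return 0;
    }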
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
index c2495640e..5b0225192 100644
--- a/third_party/aom/av1/common/mv.h
+++ b/third_party/aom/av1/common/mv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_MV_H_
-#define AV1_COMMON_MV_H_
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
#include "av1/common/common.h"
#include "av1/common/common_data.h"
@@ -56,7 +56,7 @@ typedef struct mv32 {
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
-typedef enum {
+typedef enum ATTRIBUTE_PACKED {
IDENTITY = 0, // identity transformation, 0-parameter
TRANSLATION = 1, // translational motion 2-parameter
ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
@@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
} // extern "C"
#endif
-#endif // AV1_COMMON_MV_H_
+#endif // AOM_AV1_COMMON_MV_H_
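
mv.h now tags the transformation-type enum with ATTRIBUTE_PACKED, which on GCC-compatible compilers expands to __attribute__((packed)) and lets the enum occupy the smallest integer type that fits its range, shrinking any struct that embeds it. A sketch of the size effect (assumes GCC or Clang; enumerator names are abbreviated):

    /* Why ATTRIBUTE_PACKED shrinks enum-typed fields. */
    #include <stdio.h>

    typedef enum { W_IDENTITY, W_TRANSLATION, W_ROTZOOM, W_AFFINE } WideType;
    typedef enum __attribute__((packed)) {
      P_IDENTITY, P_TRANSLATION, P_ROTZOOM, P_AFFINE
    } PackedType;

    int main(void) {
      printf("plain enum:  %zu byte(s)\n", sizeof(WideType));   /* usually 4 */
      printf("packed enum: %zu byte(s)\n", sizeof(PackedType)); /* usually 1 */
      return 0;
    }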
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
index 6939df335..7f24ab4e6 100644
--- a/third_party/aom/av1/common/mvref_common.c
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) {
den = AOMMIN(den, MAX_FRAME_DISTANCE);
num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
: AOMMAX(num, -MAX_FRAME_DISTANCE);
- int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
- int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+ const int mv_row =
+ ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+ const int mv_col =
+ ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
const int clamp_max = MV_UPP - 1;
const int clamp_min = MV_LOW + 1;
output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
}
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
- int mi_row, int mi_col, int x_mis, int y_mis) {
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis) {
const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
MV_REF *frame_mvs =
cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
@@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
uint8_t *ref_match_count, uint8_t *newmv_count,
int_mv *gm_mv_candidates, int max_row_offset,
int *processed_rows) {
- int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
const int n8_w_8 = mi_size_wide[BLOCK_8X8];
const int n8_w_16 = mi_size_wide[BLOCK_16X16];
int i;
int col_offset = 0;
- const int shift = 0;
// TODO(jingning): Revisit this part after cb4x4 is stable.
if (abs(row_offset) > 1) {
col_offset = 1;
- if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset;
+ if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
}
- const int use_step_16 = (xd->n8_w >= 16);
+ const int use_step_16 = (xd->n4_w >= 16);
MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
(void)mi_row;
for (i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
const int candidate_bsize = candidate->sb_type;
- const int n8_w = mi_size_wide[candidate_bsize];
- int len = AOMMIN(xd->n8_w, n8_w);
+ const int n4_w = mi_size_wide[candidate_bsize];
+ int len = AOMMIN(xd->n4_w, n4_w);
if (use_step_16)
len = AOMMAX(n8_w_16, len);
else if (abs(row_offset) > 1)
len = AOMMAX(len, n8_w_8);
int weight = 2;
- if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) {
+ if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
int inc = AOMMIN(-max_row_offset + row_offset + 1,
mi_size_high[candidate_bsize]);
// Obtain range used in weight calculation.
- weight = AOMMAX(weight, (inc << shift));
+ weight = AOMMAX(weight, inc);
// Update processed rows.
*processed_rows = inc - row_offset - 1;
}
@@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
uint8_t *ref_match_count, uint8_t *newmv_count,
int_mv *gm_mv_candidates, int max_col_offset,
int *processed_cols) {
- int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
const int n8_h_8 = mi_size_high[BLOCK_8X8];
const int n8_h_16 = mi_size_high[BLOCK_16X16];
int i;
int row_offset = 0;
- const int shift = 0;
if (abs(col_offset) > 1) {
row_offset = 1;
- if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset;
+ if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
}
- const int use_step_16 = (xd->n8_h >= 16);
+ const int use_step_16 = (xd->n4_h >= 16);
(void)mi_col;
for (i = 0; i < end_mi;) {
const MB_MODE_INFO *const candidate =
xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
const int candidate_bsize = candidate->sb_type;
- const int n8_h = mi_size_high[candidate_bsize];
- int len = AOMMIN(xd->n8_h, n8_h);
+ const int n4_h = mi_size_high[candidate_bsize];
+ int len = AOMMIN(xd->n4_h, n4_h);
if (use_step_16)
len = AOMMAX(n8_h_16, len);
else if (abs(col_offset) > 1)
len = AOMMAX(len, n8_h_8);
int weight = 2;
- if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) {
+ if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
int inc = AOMMIN(-max_col_offset + col_offset + 1,
mi_size_wide[candidate_bsize]);
// Obtain range used in weight calculation.
- weight = AOMMAX(weight, (inc << shift));
+ weight = AOMMAX(weight, inc);
// Update processed cols.
*processed_cols = inc - col_offset - 1;
}
@@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
mi_pos.row = row_offset;
mi_pos.col = col_offset;
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
const MB_MODE_INFO *const candidate =
xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
const int len = mi_size_wide[BLOCK_8X8];
@@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
// The left hand of two vertical rectangles always has a top right (as the
// block above will have been decoded)
- if (xd->n8_w < xd->n8_h)
+ if (xd->n4_w < xd->n4_h)
if (!xd->is_sec_rect) has_tr = 1;
// The bottom of two horizontal rectangles never has a top right (as the block
// to the right won't have been decoded)
- if (xd->n8_w > xd->n8_h)
+ if (xd->n4_w > xd->n4_h)
if (xd->is_sec_rect) has_tr = 0;
// The bottom left square of a Vertical A (in the old format) does
// not have a top right as it is decoded before the right hand
// rectangle of the partition
if (xd->mi[0]->partition == PARTITION_VERT_A) {
- if (xd->n8_w == xd->n8_h)
+ if (xd->n4_w == xd->n4_h)
if (mask_row & bs) has_tr = 0;
}
@@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
- if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0;
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
const TPL_MV_REF *prev_frame_mvs =
cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
@@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
return 0;
}
+static void process_compound_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+ int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+ for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+ if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+ ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+ ++ref_id_count[cmp_idx];
+ } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[can_rf] !=
+ cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+ ++ref_diff_count[cmp_idx];
+ }
+ }
+ }
+}
+
+static void process_single_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+ cm->ref_frame_sign_bias[ref_frame]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ int stack_idx;
+ for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+ const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+ if (this_mv.as_int == stack_mv.as_int) break;
+ }
+
+ if (stack_idx == refmv_count[ref_frame]) {
+ ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+ // TODO(jingning): Set an arbitrary small number here. The weight
+ // doesn't matter as long as it is properly initialized.
+ ref_mv_stack[ref_frame][stack_idx].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+}
+
static void setup_ref_mv_list(
const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
uint8_t refmv_count[MODE_CTX_REF_FRAMES],
CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
int mi_row, int mi_col, int16_t *mode_context) {
- const int bs = AOMMAX(xd->n8_w, xd->n8_h);
+ const int bs = AOMMAX(xd->n4_w, xd->n4_h);
const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
MV_REFERENCE_FRAME rf[2];
const TileInfo *const tile = &xd->tile;
int max_row_offset = 0, max_col_offset = 0;
- const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
- const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
int processed_rows = 0;
int processed_cols = 0;
@@ -455,17 +511,16 @@ static void setup_ref_mv_list(
if (xd->up_available) {
max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
- if (xd->n8_h < mi_size_high[BLOCK_8X8])
+ if (xd->n4_h < mi_size_high[BLOCK_8X8])
max_row_offset = -(2 << 1) + row_adj;
- max_row_offset =
- find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset);
+ max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
}
if (xd->left_available) {
max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
- if (xd->n8_w < mi_size_wide[BLOCK_8X8])
+ if (xd->n4_w < mi_size_wide[BLOCK_8X8])
max_col_offset = -(2 << 1) + col_adj;
max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
@@ -487,12 +542,12 @@ static void setup_ref_mv_list(
gm_mv_candidates, max_col_offset, &processed_cols);
// Check top-right boundary
if (has_tr)
- scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w,
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
gm_mv_candidates, &refmv_count[ref_frame]);
- uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
- uint8_t nearest_refmv_count = refmv_count[ref_frame];
+ const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t nearest_refmv_count = refmv_count[ref_frame];
// TODO(yunqing): for comp_search, do it for all 3 cases.
for (int idx = 0; idx < nearest_refmv_count; ++idx)
@@ -500,27 +555,27 @@ static void setup_ref_mv_list(
if (cm->allow_ref_frame_mvs) {
int is_available = 0;
- const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h);
- const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w);
- const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]);
- const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]);
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+ const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
const int tpl_sample_pos[3][2] = {
{ voffset, -2 },
{ voffset, hoffset },
{ voffset - 2, hoffset },
};
- const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) &&
- (xd->n8_h < mi_size_high[BLOCK_64X64]) &&
- (xd->n8_w >= mi_size_wide[BLOCK_8X8]) &&
- (xd->n8_w < mi_size_wide[BLOCK_64X64]);
-
- int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64])
- ? mi_size_high[BLOCK_16X16]
- : mi_size_high[BLOCK_8X8];
- int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64])
- ? mi_size_wide[BLOCK_16X16]
- : mi_size_wide[BLOCK_8X8];
+ const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+ (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+ (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+ const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+ ? mi_size_high[BLOCK_16X16]
+ : mi_size_high[BLOCK_8X8];
+ const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+ ? mi_size_wide[BLOCK_16X16]
+ : mi_size_wide[BLOCK_8X8];
for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
@@ -569,7 +624,7 @@ static void setup_ref_mv_list(
max_col_offset, &processed_cols);
}
- uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
switch (nearest_match) {
case 0:
@@ -636,62 +691,24 @@ static void setup_ref_mv_list(
int_mv ref_id[2][2], ref_diff[2][2];
int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
int mi_size = AOMMIN(mi_width, mi_height);
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
- const int candidate_bsize = candidate->sb_type;
-
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
- for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
- if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
- ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
- ++ref_id_count[cmp_idx];
- } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[can_rf] !=
- cm->ref_frame_sign_bias[rf[cmp_idx]]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
- ++ref_diff_count[cmp_idx];
- }
- }
- }
- idx += mi_size_wide[candidate_bsize];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_wide[candidate->sb_type];
}
for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
- const int candidate_bsize = candidate->sb_type;
-
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
-
- for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
- if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
- ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
- ++ref_id_count[cmp_idx];
- } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[can_rf] !=
- cm->ref_frame_sign_bias[rf[cmp_idx]]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
- ++ref_diff_count[cmp_idx];
- }
- }
- }
- idx += mi_size_high[candidate_bsize];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_high[candidate->sb_type];
}
// Build up the compound mv predictor
@@ -743,87 +760,37 @@ static void setup_ref_mv_list(
for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
}
} else {
// Handle single reference frame extension
- int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w);
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
- int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
int mi_size = AOMMIN(mi_width, mi_height);
for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
- const int candidate_bsize = candidate->sb_type;
-
- // TODO(jingning): Refactor the following code.
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
- cm->ref_frame_sign_bias[ref_frame]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- int stack_idx;
- for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
- int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
- if (this_mv.as_int == stack_mv.as_int) break;
- }
-
- if (stack_idx == refmv_count[ref_frame]) {
- ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
- // TODO(jingning): Set an arbitrary small number here. The weight
- // doesn't matter as long as it is properly initialized.
- ref_mv_stack[ref_frame][stack_idx].weight = 2;
- ++refmv_count[ref_frame];
- }
- }
- }
- idx += mi_size_wide[candidate_bsize];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_wide[candidate->sb_type];
}
for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
- const int candidate_bsize = candidate->sb_type;
-
- // TODO(jingning): Refactor the following code.
- for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
- if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
- int_mv this_mv = candidate->mv[rf_idx];
- if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
- cm->ref_frame_sign_bias[ref_frame]) {
- this_mv.as_mv.row = -this_mv.as_mv.row;
- this_mv.as_mv.col = -this_mv.as_mv.col;
- }
- int stack_idx;
- for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
- int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
- if (this_mv.as_int == stack_mv.as_int) break;
- }
-
- if (stack_idx == refmv_count[ref_frame]) {
- ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
-
- // TODO(jingning): Set an arbitrary small number here. The weight
- // doesn't matter as long as it is properly initialized.
- ref_mv_stack[ref_frame][stack_idx].weight = 2;
- ++refmv_count[ref_frame];
- }
- }
- }
- idx += mi_size_high[candidate_bsize];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_high[candidate->sb_type];
}
for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
- xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd);
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
}
if (mv_ref_list != NULL) {
@@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
: -((-mv.col) >> (4 + MI_SIZE_LOG2));
- int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
- int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+ const int row =
+ (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+ const int col =
+ (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
col >= (cm->mi_cols >> 1))
@@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
return 1;
}
-static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
- int dir) {
+// Note: motion_field_projection finds motion vectors of one of the current
+// frame's reference frames, and projects them onto the current frame. To
+// keep the naming clear: call that reference frame the start frame, call the
+// start frame's own reference frames simply the reference frames, and call
+// ref_offset the frame distances between the start frame and its reference
+// frames.
+static int motion_field_projection(AV1_COMMON *cm,
+ MV_REFERENCE_FRAME start_frame, int dir) {
TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
int ref_offset[REF_FRAMES] = { 0 };
(void)dir;
- int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx;
- if (ref_frame_idx < 0) return 0;
+ const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+ if (start_frame_idx < 0) return 0;
- if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0;
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
- if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows ||
- cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols)
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+ cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
return 0;
- int ref_frame_index =
- cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset;
- unsigned int *ref_rf_idx =
- &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0];
- int cur_frame_index = cm->cur_frame->cur_frame_offset;
- int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index);
+ const int start_frame_offset =
+ cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+ const unsigned int *const ref_frame_offsets =
+ &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+ const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+ int start_to_current_frame_offset =
+ get_relative_dist(cm, start_frame_offset, cur_frame_offset);
for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
- ref_offset[rf] =
- get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]);
+ ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+ ref_frame_offsets[rf - LAST_FRAME]);
}
- if (dir == 2) ref_to_cur = -ref_to_cur;
+ if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
- MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs;
+ MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
const int mvs_rows = (cm->mi_rows + 1) >> 1;
const int mvs_cols = (cm->mi_cols + 1) >> 1;
@@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame,
int mi_r, mi_c;
const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
- int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
- ref_frame_offset > 0 &&
- abs(ref_to_cur) <= MAX_FRAME_DISTANCE;
+ int pos_valid =
+ abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+ ref_frame_offset > 0 &&
+ abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
if (pos_valid) {
- get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur,
- ref_frame_offset);
+ get_mv_projection(&this_mv.as_mv, fwd_mv,
+ start_to_current_frame_offset, ref_frame_offset);
pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
this_mv.as_mv, dir >> 1);
}
if (pos_valid) {
- int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+ const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
@@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
if (up_available) {
int mi_row_offset = -1;
MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
- uint8_t n8_w = mi_size_wide[mbmi->sb_type];
+ uint8_t n4_w = mi_size_wide[mbmi->sb_type];
- if (xd->n8_w <= n8_w) {
+ if (xd->n4_w <= n4_w) {
// Handle "current block width <= above block width" case.
- int col_offset = -mi_col % n8_w;
+ int col_offset = -mi_col % n4_w;
if (col_offset < 0) do_tl = 0;
- if (col_offset + n8_w > xd->n8_w) do_tr = 0;
+ if (col_offset + n4_w > xd->n4_w) do_tr = 0;
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
@@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
// Handle "current block width > above block width" case.
- for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+ for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
int mi_col_offset = i;
mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n8_w = mi_size_wide[mbmi->sb_type];
- mi_step = AOMMIN(xd->n8_w, n8_w);
+ n4_w = mi_size_wide[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_w, n4_w);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
int mi_col_offset = -1;
MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
- uint8_t n8_h = mi_size_high[mbmi->sb_type];
+ uint8_t n4_h = mi_size_high[mbmi->sb_type];
- if (xd->n8_h <= n8_h) {
+ if (xd->n4_h <= n4_h) {
// Handle "current block height <= above block height" case.
- int row_offset = -mi_row % n8_h;
+ int row_offset = -mi_row % n4_h;
if (row_offset < 0) do_tl = 0;
@@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
}
} else {
// Handle "current block height > above block height" case.
- for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+ for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
int mi_row_offset = i;
mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- n8_h = mi_size_high[mbmi->sb_type];
- mi_step = AOMMIN(xd->n8_h, n8_h);
+ n4_h = mi_size_high[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_h, n4_h);
if (mbmi->ref_frame[0] == ref_frame &&
mbmi->ref_frame[1] == NONE_FRAME) {
@@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
// Top-right block
if (do_tr &&
- has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) {
- POSITION trb_pos = { -1, xd->n8_w };
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+ POSITION trb_pos = { -1, xd->n4_w };
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
int mi_row_offset = -1;
- int mi_col_offset = xd->n8_w;
+ int mi_col_offset = xd->n4_w;
MB_MODE_INFO *mbmi =
xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
- record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1);
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
np++;
if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
}
@@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
REF_FRAME_INFO *ref_info) {
- assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME);
+ assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
const int buf_idx = ref_info->buf_idx;
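
For reference, get_mv_projection() above scales a motion vector by num/den in fixed point: div_mult[den] approximates 16384/den, so ref * num * div_mult[den] >> 14 approximates ref * num / den without a division per component. A standalone sketch under that assumption (the multiplier is computed on the fly here; libaom precomputes a table):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of ROUND_POWER_OF_TWO_SIGNED: round the magnitude half up,
     * then restore the sign. */
    static int round_power_of_two_signed(int64_t v, int n) {
      return (int)(v < 0 ? -((-v + (1 << (n - 1))) >> n)
                         : (v + (1 << (n - 1))) >> n);
    }

    /* Approximates ref * num / den for den > 0. */
    static int project(int ref, int num, int den) {
      const int div_mult = (1 << 14) / den; /* assumed ~= libaom's table */
      return round_power_of_two_signed((int64_t)ref * num * div_mult, 14);
    }

    int main(void) {
      /* scale a row component of -72 from frame distance 5 down to 3 */
      printf("%d\n", project(-72, 3, 5)); /* -43, close to -72 * 3 / 5 */
      return 0;
    }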
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
index f68c159e1..83f7a1ac0 100644
--- a/third_party/aom/av1/common/mvref_common.h
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_MVREF_COMMON_H_
-#define AV1_COMMON_MVREF_COMMON_H_
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
#include "av1/common/onyxc_int.h"
#include "av1/common/blockd.h"
@@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
- int mi_rows, const POSITION *mi_pos) {
- const int dependent_horz_tile_flag = 0;
- if (dependent_horz_tile_flag && !tile->tg_horz_boundary) {
- return !(mi_row + mi_pos->row < 0 ||
- mi_col + mi_pos->col < tile->mi_col_start ||
- mi_row + mi_pos->row >= mi_rows ||
- mi_col + mi_pos->col >= tile->mi_col_end);
- } else {
- return !(mi_row + mi_pos->row < tile->mi_row_start ||
- mi_col + mi_pos->col < tile->mi_col_start ||
- mi_row + mi_pos->row >= tile->mi_row_end ||
- mi_col + mi_pos->col >= tile->mi_col_end);
- }
+ const POSITION *mi_pos) {
+ return !(mi_row + mi_pos->row < tile->mi_row_start ||
+ mi_col + mi_pos->col < tile->mi_col_start ||
+ mi_row + mi_pos->row >= tile->mi_row_end ||
+ mi_col + mi_pos->col >= tile->mi_col_end);
}
static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
- int mi_rows, int row_offset) {
- const int dependent_horz_tile_flag = 0;
- if (dependent_horz_tile_flag && !tile->tg_horz_boundary)
- return clamp(row_offset, -mi_row, mi_rows - mi_row - 1);
- else
- return clamp(row_offset, tile->mi_row_start - mi_row,
- tile->mi_row_end - mi_row - 1);
+ int row_offset) {
+ return clamp(row_offset, tile->mi_row_start - mi_row,
+ tile->mi_row_end - mi_row - 1);
}
static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
@@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
}
}
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
- int mi_row, int mi_col, int x_mis, int y_mis);
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis);
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
@@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
-#define USE_WAVE_FRONT 1 // Use only top left area of frame for reference.
static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
int mib_size, int mi_row, int mi_col) {
@@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
-#if USE_WAVE_FRONT
+ // Wavefront constraint: use only top left area of frame for reference.
const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
const int wf_offset = gradient * (active_sb_row - src_sb_row);
if (src_sb_row > active_sb_row ||
src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
return 0;
-#endif
return 1;
}
@@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
} // extern "C"
#endif
-#endif // AV1_COMMON_MVREF_COMMON_H_
+#endif // AOM_AV1_COMMON_MVREF_COMMON_H_
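
The USE_WAVE_FRONT guard removed above made the wavefront check unconditional: an intra block-copy source must lie in the already-decoded top-left region, with a per-row slope of "gradient" 64x64 units. A sketch of just that comparison, with made-up coordinates (INTRABC_DELAY_SB64 mirrors the header's 256 / 64):

    #include <stdio.h>

    #define INTRABC_DELAY_SB64 4 /* 256 pixels / 64 */

    static int src_allowed(int active_sb_row, int active_sb64_col,
                           int src_sb_row, int src_sb64_col, int sb_size) {
      const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
      const int wf_offset = gradient * (active_sb_row - src_sb_row);
      if (src_sb_row > active_sb_row ||
          src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
        return 0;
      return 1;
    }

    int main(void) {
      /* one row above, far enough left: allowed */
      printf("%d\n", src_allowed(3, 10, 2, 0, 64)); /* 1 */
      /* same row, inside the delay window: rejected */
      printf("%d\n", src_allowed(3, 10, 3, 7, 64)); /* 0 */
      return 0;
    }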
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
index 3918c82c6..1c90cd93f 100644
--- a/third_party/aom/av1/common/obmc.h
+++ b/third_party/aom/av1/common/obmc.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_OBMC_H_
-#define AV1_COMMON_OBMC_H_
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
uint8_t nb_mi_size,
@@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
// prev_row_mi points into the mi array, starting at the beginning of the
// previous row.
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
- const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols);
+ const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
uint8_t mi_step;
for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
above_mi_col += mi_step) {
@@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*above_mi)) {
++nb_count;
- fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi,
+ fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
fun_ctxt, num_planes);
}
}
@@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
// prev_col_mi points into the mi array, starting at the top of the
// previous column
MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
- const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows);
+ const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
uint8_t mi_step;
for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
left_mi_row += mi_step) {
@@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
}
if (is_neighbor_overlappable(*left_mi)) {
++nb_count;
- fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi,
+ fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
fun_ctxt, num_planes);
}
}
}
-#endif // AV1_COMMON_OBMC_H_
+#endif // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 000000000..823b700b1
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+ int valid_type = 0;
+ switch (obu_type) {
+ case OBU_SEQUENCE_HEADER:
+ case OBU_TEMPORAL_DELIMITER:
+ case OBU_FRAME_HEADER:
+ case OBU_TILE_GROUP:
+ case OBU_METADATA:
+ case OBU_FRAME:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_TILE_LIST:
+ case OBU_PADDING: valid_type = 1; break;
+ default: break;
+ }
+ return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+ size_t bytes_available,
+ size_t *const obu_size,
+ size_t *const length_field_size) {
+ uint64_t u_obu_size = 0;
+ if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+ 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+ *obu_size = (size_t)u_obu_size;
+ return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+ int is_annexb, ObuHeader *header) {
+ if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+ const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+ if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size = 1;
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // Forbidden bit. Must not be set.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+ if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->has_extension = aom_rb_read_bit(rb);
+ header->has_size_field = aom_rb_read_bit(rb);
+
+ if (!header->has_size_field && !is_annexb) {
+ // Section 5 OBU streams must have the obu_size field set.
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // obu_reserved_1bit must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (header->has_extension) {
+ if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size += 1;
+ header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+ header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+ if (aom_rb_read_literal(rb, 3) != 0) {
+ // extension_header_reserved_3bits must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb) {
+ if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+ // TODO(tomfinegan): Set the error handler here and throughout this file, and
+ // confirm parsing work done via aom_read_bit_buffer is successful.
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+ NULL };
+ aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+ if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+ return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read) {
+ size_t length_field_size = 0, obu_size = 0;
+ aom_codec_err_t status;
+
+ if (is_annexb) {
+ // Size field comes before the OBU header, and includes the OBU header
+ status =
+ read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ struct aom_read_bit_buffer rb = { data + length_field_size,
+ data + bytes_available, 0, NULL, NULL };
+
+ status = read_obu_header(&rb, is_annexb, obu_header);
+ if (status != AOM_CODEC_OK) return status;
+
+ if (is_annexb) {
+ // Derive the payload size from the data we've already read
+ if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+ *payload_size = obu_size - obu_header->size;
+ } else {
+ // Size field comes after the OBU header, and is just the payload size
+ status = read_obu_size(data + obu_header->size,
+ bytes_available - obu_header->size, payload_size,
+ &length_field_size);
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ *bytes_read = length_field_size + obu_header->size;
+ return AOM_CODEC_OK;
+}
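
read_obu_header() above consumes the fixed AV1 OBU header layout bit by bit: a forbidden bit, a 4-bit obu_type, the extension and has-size flags, and a reserved bit. A standalone sketch decoding the first header byte by hand, rather than through the library's bit reader:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint8_t byte0 = 0x12; /* 0001 0010: a temporal delimiter OBU */
      const int forbidden_bit  = (byte0 >> 7) & 1;   /* must be 0 */
      const int obu_type       = (byte0 >> 3) & 0xF; /* 2 = temporal delim. */
      const int has_extension  = (byte0 >> 2) & 1;
      const int has_size_field = (byte0 >> 1) & 1;
      const int reserved_1bit  =  byte0       & 1;   /* must be 0 */
      printf("forbidden=%d type=%d ext=%d size=%d reserved=%d\n",
             forbidden_bit, obu_type, has_extension, has_size_field,
             reserved_1bit);
      return 0;
    }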
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 000000000..7c56904c8
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ size_t size; // Size (1 or 2 bytes) of the OBU header (including the
+ // optional OBU extension header) in the bitstream.
+ OBU_TYPE type;
+ int has_size_field;
+ int has_extension;
+ // The following fields come from the OBU extension header and therefore are
+ // only used if has_extension is true.
+ int temporal_layer_id;
+ int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_OBU_UTIL_H_
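
A hedged usage sketch for the new API: walking a section-5 (is_annexb == 0) byte stream OBU by OBU. The fragment links against libaom; the input buffer is assumed to hold a well-formed stream, and error handling is abbreviated.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include "av1/common/obu_util.h"

    static void walk_obus(const uint8_t *data, size_t len) {
      while (len > 0) {
        ObuHeader hdr;
        size_t payload_size = 0, bytes_read = 0;
        if (aom_read_obu_header_and_size(data, len, /*is_annexb=*/0, &hdr,
                                         &payload_size, &bytes_read) !=
            AOM_CODEC_OK) {
          fprintf(stderr, "corrupt OBU\n");
          return;
        }
        if (bytes_read + payload_size > len) return; /* truncated stream */
        printf("type=%d header+length=%zu payload=%zu\n", (int)hdr.type,
               bytes_read, payload_size);
        data += bytes_read + payload_size;
        len -= bytes_read + payload_size;
      }
    }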
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
index e87c5a0bf..e1db0f44d 100644
--- a/third_party/aom/av1/common/odintrin.h
+++ b/third_party/aom/av1/common/odintrin.h
@@ -11,8 +11,8 @@
/* clang-format off */
-#ifndef AV1_COMMON_ODINTRIN_H_
-#define AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
#include <stdlib.h>
#include <string.h>
@@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
#define OD_MAXI AOMMAX
#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
-#define OD_CLZ0 (1)
-#define OD_CLZ(x) (-get_msb(x))
-#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
/*Enable special features for gcc and compatible compilers.*/
#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
@@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
} // extern "C"
#endif
-#endif // AV1_COMMON_ODINTRIN_H_
+#endif // AOM_AV1_COMMON_ODINTRIN_H_
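
The rewritten OD_ILOG_NZ keeps the value of the OD_CLZ formulation it replaces: for nonzero x it returns floor(log2(x)) + 1. A self-contained check, with a portable stand-in for libaom's get_msb():

    #include <stdint.h>
    #include <stdio.h>

    static int get_msb(uint32_t x) { /* index of the highest set bit */
      int msb = 0;
      while (x >>= 1) ++msb;
      return msb;
    }

    #define OD_ILOG_NZ(x) (1 + get_msb(x))

    int main(void) {
      printf("%d %d %d\n", OD_ILOG_NZ(1), OD_ILOG_NZ(8), OD_ILOG_NZ(9));
      /* prints: 1 4 4 */
      return 0;
    }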
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
index 6b1bf2d74..ff011c89e 100644
--- a/third_party/aom/av1/common/onyxc_int.h
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_ONYXC_INT_H_
-#define AV1_COMMON_ONYXC_INT_H_
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
@@ -480,6 +480,7 @@ typedef struct AV1Common {
int byte_alignment;
int skip_loop_filter;
+ int skip_film_grain;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
xd->chroma_left_mbmi = chroma_left_mi;
}
- xd->n8_h = bh;
- xd->n8_w = bw;
+ xd->n4_h = bh;
+ xd->n4_w = bw;
xd->is_sec_rect = 0;
- if (xd->n8_w < xd->n8_h) {
+ if (xd->n4_w < xd->n4_h) {
// Only mark is_sec_rect as 1 for the last block.
// For PARTITION_VERT_4, it would be (0, 0, 0, 1);
// For other partitions, it would be (0, 1).
- if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1;
+ if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
}
- if (xd->n8_w > xd->n8_h)
- if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1;
+ if (xd->n4_w > xd->n4_h)
+ if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
}
static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
@@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
}
-static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip,
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
const MACROBLOCKD *xd) {
uint8_t bw = tx_size_wide[tx_size];
uint8_t bh = tx_size_high[tx_size];
if (skip) {
- bw = n8_w * MI_SIZE;
- bh = n8_h * MI_SIZE;
+ bw = n4_w * MI_SIZE;
+ bh = n4_h * MI_SIZE;
}
- set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
- set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+ set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
}
static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
@@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
} // extern "C"
#endif
-#endif // AV1_COMMON_ONYXC_INT_H_
+#endif // AOM_AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
index 58933a7b3..026a07809 100644
--- a/third_party/aom/av1/common/ppc/cfl_ppc.c
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -24,19 +24,21 @@
#define CFL_LINE_2 128
#define CFL_LINE_3 192
-typedef vector int8_t int8x16_t;
-typedef vector uint8_t uint8x16_t;
-typedef vector int16_t int16x8_t;
-typedef vector uint16_t uint16x8_t;
-typedef vector int32_t int32x4_t;
-typedef vector uint32_t uint32x4_t;
-typedef vector uint64_t uint64x2_t;
+typedef vector signed char int8x16_t; // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
+typedef vector signed short int16x8_t; // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
+typedef vector signed int int32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
-static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
- int height, int round_offset,
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+ int width, int height, int round_offset,
int num_pel_log2) {
- const int16_t *end = pred_buf + height * CFL_BUF_LINE;
- const int16_t *sum_buf = pred_buf;
+ const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+ const int16_t *sum_buf = (const int16_t *)src_ptr;
+ const int16_t *end = sum_buf + height * CFL_BUF_LINE;
const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width,
const int32x4_t avg = vec_sr(sum_32x4, div_shift);
const int16x8_t vec_avg = vec_pack(avg, avg);
do {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_0 + CFL_BUF_LINE_BYTES, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_0 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_0 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
+ OFF_0 + CFL_BUF_LINE_BYTES, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
+ OFF_0 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
+ OFF_0 + CFL_LINE_3, dst);
if (width >= 16) {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_1 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
+ OFF_1 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
+ OFF_1 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
+ OFF_1 + CFL_LINE_3, dst);
}
if (width == 32) {
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_2 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
+ OFF_2 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
+ OFF_2 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
+ OFF_2 + CFL_LINE_3, dst);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3,
- pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_1, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_2, pred_buf);
- vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg),
- OFF_3 + CFL_LINE_3, pred_buf);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
+ OFF_3 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
+ OFF_3 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
+ OFF_3 + CFL_LINE_3, dst);
}
- } while ((pred_buf += CFL_BUF_LINE * 4) < end);
+ } while ((dst += CFL_BUF_LINE * 4) < dst_end);
}
// Declare wrappers for VSX sizes
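
The VSX hunk above changes subtract_average from an in-place transform of pred_buf into an explicit src/dst pair, matching the updated CFL prototype. What the vector code computes is easier to see in scalar form; the sketch below mirrors the C reference behavior under the assumption (as in cfl.h) that CFL_BUF_LINE is the 32-sample stride of the prediction buffer and that width * height == 1 << num_pel_log2:

    #include <stdint.h>

    #define CFL_BUF_LINE 32 /* stride of the CFL prediction buffer (cfl.h) */

    /* Scalar equivalent of subtract_average_vsx above: sum the block, form
     * the rounded mean (division by the pixel count, a power of two), and
     * write mean-subtracted samples to dst. */
    static void subtract_average_scalar(const uint16_t *src, int16_t *dst,
                                        int width, int height,
                                        int round_offset, int num_pel_log2) {
      int32_t sum = round_offset; /* pre-biased so the shift rounds */
      for (int j = 0; j < height; ++j)
        for (int i = 0; i < width; ++i) sum += src[j * CFL_BUF_LINE + i];
      const int32_t avg = sum >> num_pel_log2;
      for (int j = 0; j < height; ++j)
        for (int i = 0; i < width; ++i)
          dst[j * CFL_BUF_LINE + i] =
              (int16_t)(src[j * CFL_BUF_LINE + i] - avg);
    }

    int main(void) {
      uint16_t src[2 * CFL_BUF_LINE] = { 0 };
      int16_t dst[2 * CFL_BUF_LINE];
      src[0] = 10; src[1] = 20;
      src[CFL_BUF_LINE] = 30; src[CFL_BUF_LINE + 1] = 40;
      subtract_average_scalar(src, dst, 2, 2, 2, 2); /* mean 25 */
      return dst[0] == -15 ? 0 : 1; /* -15, -5, 5, 15 */
    }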
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
index d77739d85..5952441d1 100644
--- a/third_party/aom/av1/common/pred_common.c
+++ b/third_party/aom/av1/common/pred_common.c
@@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
const int ctx_offset =
(mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
- MV_REFERENCE_FRAME ref_frame =
- (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+ assert(dir == 0 || dir == 1);
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
index 6a835c467..6dba2322d 100644
--- a/third_party/aom/av1/common/pred_common.h
+++ b/third_party/aom/av1/common/pred_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_PRED_COMMON_H_
-#define AV1_COMMON_PRED_COMMON_H_
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
#include "av1/common/blockd.h"
#include "av1/common/mvref_common.h"
@@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
} // extern "C"
#endif
-#endif // AV1_COMMON_PRED_COMMON_H_
+#endif // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
index ca199e94c..d1f52a660 100644
--- a/third_party/aom/av1/common/quant_common.h
+++ b/third_party/aom/av1/common/quant_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_QUANT_COMMON_H_
-#define AV1_COMMON_QUANT_COMMON_H_
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
#include "aom/aom_codec.h"
#include "av1/common/seg_common.h"
@@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
} // extern "C"
#endif
-#endif // AV1_COMMON_QUANT_COMMON_H_
+#endif // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index b9f0b57f3..3203efce4 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
if (build_for_obmc) return 0;
- if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) {
+ if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
if (final_warp_params != NULL)
- memcpy(final_warp_params, &mbmi->wm_params[0],
- sizeof(*final_warp_params));
+ memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
return 1;
} else if (warp_types->global_warp_allowed && !gm_params->invalid) {
if (final_warp_params != NULL)
@@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
build_for_obmc, subpel_params->xs, subpel_params->ys,
&final_warp_params));
+ const int is_intrabc = mi->use_intrabc;
+ assert(IMPLIES(is_intrabc, !do_warp));
+
if (do_warp && xd->cur_frame_force_integer_mv == 0) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
pd->subsampling_x, pd->subsampling_y, conv_params);
} else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
- w, h, conv_params, interp_filters, xd->bd);
+ w, h, conv_params, interp_filters, is_intrabc,
+ xd->bd);
} else {
inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
- conv_params, interp_filters);
+ conv_params, interp_filters, is_intrabc);
}
}
@@ -574,37 +577,6 @@ static void build_masked_compound_no_round(
h, subw, subh, conv_params);
}
-static void build_masked_compound(
- uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
- const uint8_t *src1, int src1_stride,
- const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
- const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, block_size_wide[sb_type], w, h, subw, subh);
-}
-
-static void build_masked_compound_highbd(
- uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
- const uint8_t *src1_8, int src1_stride,
- const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
- int w, int bd) {
- // Derive subsampling from h and w passed in. May be refactored to
- // pass in subsampling factors directly.
- const int subh = (2 << mi_size_high_log2[sb_type]) == h;
- const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
- const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- // const uint8_t *mask =
- // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
- aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
- src1_stride, mask, block_size_wide[sb_type], w, h,
- subw, subh, bd);
-}
-
void av1_make_masked_inter_predictor(
const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
@@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor(
mi->sb_type, h, w, conv_params, xd);
}
-// TODO(sarahparker) av1_highbd_build_inter_predictor and
-// av1_build_inter_predictor should be combined with
-// av1_make_inter_predictor
-void av1_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
- InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- mv.col += SCALE_EXTRA_OFF;
- mv.row += SCALE_EXTRA_OFF;
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- mv.col & SCALE_SUBPEL_MASK,
- mv.row & SCALE_SUBPEL_MASK };
- ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
-
- src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
- (mv.col >> SCALE_SUBPEL_BITS);
-
- av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
- w, h, &conv_params, interp_filters, warp_types,
- p_col, p_row, plane, ref, xd->mi[0], 0, xd,
- can_use_previous);
-}
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const MV *src_mv,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- mv.col += SCALE_EXTRA_OFF;
- mv.row += SCALE_EXTRA_OFF;
-
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- mv.col & SCALE_SUBPEL_MASK,
- mv.row & SCALE_SUBPEL_MASK };
- src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
- (mv.col >> SCALE_SUBPEL_BITS);
-
- av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
- w, h, conv_params, interp_filters, warp_types, p_col,
- p_row, plane, ref, xd->mi[0], 0, xd,
- can_use_previous);
-}
-
void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
int order_idx, int *fwd_offset, int *bck_offset,
int *use_jnt_comp_avg, int is_compound) {
@@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
*bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
}
-static INLINE void calc_subpel_params(
- MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
- int plane, const int pre_x, const int pre_y, int x, int y,
- struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
- int bw, int bh) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const int is_scaled = av1_is_scaled(sf);
- if (is_scaled) {
- int ssx = pd->subsampling_x;
- int ssy = pd->subsampling_y;
- int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
- orig_pos_y += mv.row * (1 << (1 - ssy));
- int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
- orig_pos_x += mv.col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
- subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
- subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
- subpel_params->xs = sf->x_step_q4;
- subpel_params->ys = sf->y_step_q4;
- } else {
- const MV mv_q4 = clamp_mv_to_umv_border_sb(
- xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
- subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
- subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
- (x + (mv_q4.col >> SUBPEL_BITS));
- }
-}
-
-static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int plane, const MB_MODE_INFO *mi,
- int build_for_obmc, int bw, int bh,
- int mi_x, int mi_y) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- int is_compound = has_second_ref(mi);
- int ref;
- const int is_intrabc = is_intrabc_block(mi);
- assert(IMPLIES(is_intrabc, !is_compound));
- int is_global[2] = { 0, 0 };
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
- is_global[ref] = is_global_mv_block(mi, wm->wmtype);
- }
-
- const BLOCK_SIZE bsize = mi->sb_type;
- const int ss_x = pd->subsampling_x;
- const int ss_y = pd->subsampling_y;
- int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
- (block_size_high[bsize] < 8 && ss_y);
-
- if (is_intrabc) sub8x8_inter = 0;
-
- // For sub8x8 chroma blocks, we may be covering more than one luma block's
- // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
- // the top-left corner of the prediction source - the correct top-left corner
- // is at (pre_x, pre_y).
- const int row_start =
- (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
- const int col_start =
- (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
-
- sub8x8_inter = sub8x8_inter && !build_for_obmc;
- if (sub8x8_inter) {
- for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
- for (int col = col_start; col <= 0; ++col) {
- const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
- if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
- }
- }
- }
-
- if (sub8x8_inter) {
- // block size
- const int b4_w = block_size_wide[bsize] >> ss_x;
- const int b4_h = block_size_high[bsize] >> ss_y;
- const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
- const int b8_w = block_size_wide[plane_bsize] >> ss_x;
- const int b8_h = block_size_high[plane_bsize] >> ss_y;
- assert(!is_compound);
-
- const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
-
- int row = row_start;
- for (int y = 0; y < b8_h; y += b4_h) {
- int col = col_start;
- for (int x = 0; x < b8_w; x += b4_w) {
- MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
- is_compound = has_second_ref(this_mbmi);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
- int tmp_dst_stride = 8;
- assert(bw < 8 || bh < 8);
- ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
-
- ref = 0;
- const RefBuffer *ref_buf =
- &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
-
- pd->pre[ref].buf0 =
- (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
- pd->pre[ref].buf =
- pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
- ref_buf->buf->uv_stride,
- &ref_buf->sf);
- pd->pre[ref].width = ref_buf->buf->uv_crop_width;
- pd->pre[ref].height = ref_buf->buf->uv_crop_height;
- pd->pre[ref].stride = ref_buf->buf->uv_stride;
-
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &ref_buf->sf;
- struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
-
- const MV mv = this_mbmi->mv[ref].as_mv;
-
- uint8_t *pre;
- SubpelParams subpel_params;
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[ref];
- warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
-
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- conv_params.ref = ref;
- conv_params.do_average = ref;
- if (is_masked_compound_type(mi->interinter_comp.type)) {
- // masked compound type has its own average mechanism
- conv_params.do_average = 0;
- }
-
- av1_make_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
- b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
- (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
- plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
-
- ++col;
- }
- ++row;
- }
-
- for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
- return;
- }
-
- {
- DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
- ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
- av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
- &conv_params.bck_offset,
- &conv_params.use_jnt_comp_avg, is_compound);
-
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
- const MV mv = mi->mv[ref].as_mv;
-
- uint8_t *pre;
- SubpelParams subpel_params;
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[ref];
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- conv_params.ref = ref;
-
- if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
- // masked compound type has its own average mechanism
- conv_params.do_average = 0;
- av1_make_masked_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
- bh, &conv_params, mi->interp_filters, plane, &warp_types,
- mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
- cm->allow_warped_motion);
- } else {
- conv_params.do_average = ref;
- av1_make_inter_predictor(
- pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
- bh, &conv_params, mi->interp_filters, &warp_types,
- mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
- mi, build_for_obmc, xd, cm->allow_warped_motion);
- }
- }
- }
-}
-
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
- MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int plane_from, int plane_to) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = pd->width;
- const int bh = pd->height;
-
- if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
- pd->subsampling_y))
- continue;
-
- build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
- }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
-
- if (is_interintra_pred(xd->mi[0])) {
- BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
- { xd->plane[0].dst.stride, 0, 0 } };
- if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
- xd->plane[0].dst.stride, ctx, 0, bsize);
- }
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
- MAX_MB_PLANE - 1);
-
- if (is_interintra_pred(xd->mi[0])) {
- BUFFER_SET default_ctx = {
- { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
- { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
- };
- if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors_sbuv(
- cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
- xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
- }
-}
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- const int num_planes = av1_num_planes(cm);
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- if (num_planes > 1)
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
-}
-
void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const int plane_start, const int plane_end) {
@@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred(
xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
xd->mb_to_right_edge = ctxt->mb_to_far_edge +
- (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_above_pred(
- MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
- MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
- struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
- const int above_mi_col = ctxt->mi_col + rel_mi_col;
- int mi_x, mi_y;
- MB_MODE_INFO backup_mbmi = *above_mbmi;
-
- av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
- above_mbmi, ctxt, num_planes);
- mi_x = above_mi_col << MI_SIZE_LOG2;
- mi_y = ctxt->mi_row << MI_SIZE_LOG2;
-
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
- for (int j = 0; j < num_planes; ++j) {
- const struct macroblockd_plane *pd = &xd->plane[j];
- int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
- int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
- block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
-
- if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
- build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
- }
- *above_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]) {
- if (!xd->up_available) return;
-
- // Adjust mb_to_bottom_edge to have the correct value for the OBMC
- // prediction block. This is half the height of the original block,
-  // except for 128-tall blocks, where we only use a height of 32.
- int this_height = xd->n8_h * MI_SIZE;
- int pred_height = AOMMIN(this_height / 2, 32);
- xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
-
- struct build_prediction_ctxt ctxt = { cm, mi_row,
- mi_col, tmp_buf,
- tmp_width, tmp_height,
- tmp_stride, xd->mb_to_right_edge };
- BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- foreach_overlappable_nb_above(cm, xd, mi_col,
- max_neighbor_obmc[mi_size_wide_log2[bsize]],
- build_prediction_by_above_pred, &ctxt);
-
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ctxt.mb_to_far_edge;
- xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+ (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
}
void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
@@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
xd->mb_to_bottom_edge =
ctxt->mb_to_far_edge +
- (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
-}
-
-static INLINE void build_prediction_by_left_pred(
- MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
- MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
- struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
- const int left_mi_row = ctxt->mi_row + rel_mi_row;
- int mi_x, mi_y;
- MB_MODE_INFO backup_mbmi = *left_mbmi;
-
- av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
- left_mbmi, ctxt, num_planes);
- mi_x = ctxt->mi_col << MI_SIZE_LOG2;
- mi_y = left_mi_row << MI_SIZE_LOG2;
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-
- for (int j = 0; j < num_planes; ++j) {
- const struct macroblockd_plane *pd = &xd->plane[j];
- int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
- block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
- int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
-
- if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
- build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
- }
- *left_mbmi = backup_mbmi;
-}
-
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]) {
- if (!xd->left_available) return;
-
- // Adjust mb_to_right_edge to have the correct value for the OBMC
- // prediction block. This is half the width of the original block,
- // except for 128-wide blocks, where we only use a width of 32.
- int this_width = xd->n8_w * MI_SIZE;
- int pred_width = AOMMIN(this_width / 2, 32);
- xd->mb_to_right_edge += (this_width - pred_width) * 8;
-
- struct build_prediction_ctxt ctxt = { cm, mi_row,
- mi_col, tmp_buf,
- tmp_width, tmp_height,
- tmp_stride, xd->mb_to_bottom_edge };
- BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- foreach_overlappable_nb_left(cm, xd, mi_row,
- max_neighbor_obmc[mi_size_high_log2[bsize]],
- build_prediction_by_left_pred, &ctxt);
-
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_right_edge -= (this_width - pred_width) * 8;
- xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
-}
-
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
- const int num_planes = av1_num_planes(cm);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
- int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- int len = sizeof(uint16_t);
- dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
- dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
- dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
- dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
- dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
- dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
- } else {
- dst_buf1[0] = tmp_buf1;
- dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
- dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
- dst_buf2[0] = tmp_buf2;
- dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
- dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
- }
- av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
- dst_width1, dst_height1, dst_stride1);
- av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
- dst_width2, dst_height2, dst_stride2);
- av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
- mi_row, mi_col, 0, num_planes);
- av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
- dst_buf2, dst_stride2);
+ (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
}
/* clang-format off */
@@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
}
-
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, uint8_t *upred,
- uint8_t *vpred, int ystride, int ustride,
- int vstride, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize);
- av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride,
- ctx, bsize);
-}
-
-// Builds the inter-predictor for the single ref case
-// for use in the encoder to search the wedges efficiently.
-static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
- int bw, int bh, int x, int y,
- int w, int h, int mi_x, int mi_y,
- int ref, uint8_t *const ext_dst,
- int ext_dst_stride,
- int can_use_previous) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO *mi = xd->mi[0];
-
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
- const MV mv = mi->mv[ref].as_mv;
-
- ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd);
- WarpTypesAllowed warp_types;
- const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
- warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- const int pre_x = (mi_x) >> pd->subsampling_x;
- const int pre_y = (mi_y) >> pd->subsampling_y;
- uint8_t *pre;
- SubpelParams subpel_params;
- calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
- &subpel_params, bw, bh);
-
- av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
- &subpel_params, sf, w, h, &conv_params,
- mi->interp_filters, &warp_types, pre_x + x,
- pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
-}
-
-void av1_build_inter_predictors_for_planes_single_buf(
- MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
- int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
- int can_use_previous) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(
- bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
- mi_y, ref, ext_dst[plane],
- ext_dst_stride[plane], can_use_previous);
- }
-}
-
-static void build_wedge_inter_predictor_from_buf(
- MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
- int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int is_compound = has_second_ref(mbmi);
- MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- mbmi->interinter_comp.seg_mask = xd->seg_mask;
- const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
-
- if (is_compound && is_masked_compound_type(comp_data->type)) {
- if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- av1_build_compound_diffwtd_mask_highbd(
- comp_data->seg_mask, comp_data->mask_type,
- CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
- else
- av1_build_compound_diffwtd_mask(
- comp_data->seg_mask, comp_data->mask_type, ext_dst0,
- ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
- }
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- build_masked_compound_highbd(
- dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
- mbmi->sb_type, h, w, xd->bd);
- else
- build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
- ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
- h, w);
- } else {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
- dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
- xd->bd);
- else
- aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
- 0, NULL, 0, w, h);
- }
-}
-
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane_from, int plane_to,
- uint8_t *ext_dst0[3],
- int ext_dst_stride0[3],
- uint8_t *ext_dst1[3],
- int ext_dst_stride1[3]) {
- int plane;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(
- bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- build_wedge_inter_predictor_from_buf(
- xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
- ext_dst1[plane], ext_dst_stride1[plane]);
- }
-}
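
Most of the predictor-building machinery deleted above (build_inter_predictors, the OBMC helpers, the single-buffer encoder paths) moves out of common/ in this import rather than disappearing. One detail worth keeping from the removed calc_subpel_params is how the unscaled branch splits a motion component into an integer pixel offset plus a fractional phase on the finer scaled-convolution grid. A small worked sketch, using the constant values from filter.h and scale.h at this revision:

    #include <stdio.h>

    /* Values as defined in av1/common/filter.h and av1/common/scale.h. */
    #define SUBPEL_BITS 4 /* q4: 1/16-pel motion components */
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
    #define SCALE_SUBPEL_BITS 10 /* scaled convolution uses a 1/1024 grid */
    #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)

    int main(void) {
      /* An mv column of 37 in q4 units = 2 full pixels + 5/16 pel. */
      const int mv_col_q4 = 37;
      const int int_pixels = mv_col_q4 >> SUBPEL_BITS; /* 2: source offset */
      const int subpel_x = (mv_col_q4 & SUBPEL_MASK) << SCALE_EXTRA_BITS;
      printf("int=%d subpel_x=%d\n", int_pixels, subpel_x); /* 2, 320 */
      return 0;
    }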
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
index 6a3def270..db86c777e 100644
--- a/third_party/aom/av1/common/reconinter.h
+++ b/third_party/aom/av1/common/reconinter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RECONINTER_H_
-#define AV1_COMMON_RECONINTER_H_
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
@@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
const SubpelParams *subpel_params,
const struct scale_factors *sf, int w, int h,
ConvolveParams *conv_params,
- InterpFilters interp_filters) {
+ InterpFilters interp_filters,
+ int is_intrabc) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
- if (has_scale(subpel_params->xs, subpel_params->ys)) {
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_params->subpel_x,
subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf);
+ subpel_params->ys, 1, conv_params, sf, is_intrabc);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
- sp.ys, 0, conv_params, sf);
+ sp.ys, 0, conv_params, sf, is_intrabc);
}
}
-static INLINE void highbd_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
- int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) {
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc, int bd) {
assert(conv_params->do_average == 0 || conv_params->do_average == 1);
assert(sf);
- if (has_scale(subpel_params->xs, subpel_params->ys)) {
- av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
- interp_filters, subpel_params->subpel_x,
- subpel_params->xs, subpel_params->subpel_y,
- subpel_params->ys, 1, conv_params, sf, bd);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters,
+ subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
} else {
SubpelParams sp = *subpel_params;
revert_scale_extra_bits(&sp);
- av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
- interp_filters, sp.subpel_x, sp.xs,
- sp.subpel_y, sp.ys, 0, conv_params, sf, bd);
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+ sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
}
}
@@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
return clamped_mv;
}
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, const MV *src_mv,
- const struct scale_factors *sf, int w, int h,
- ConvolveParams *conv_params,
- InterpFilters interp_filters,
- const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous);
-
-void av1_highbd_build_inter_predictor(
- const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
- const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
- InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col,
- int p_row, int plane, enum mv_precision precision, int x, int y,
- const MACROBLOCKD *xd, int can_use_previous);
-
static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *sf) {
const int x =
@@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf, const int num_planes);
-// Detect if the block has sub-pixel level motion vectors
-// per component.
-#define CHECK_SUBPEL 0
-static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
- const MACROBLOCKD *const xd,
- int dir) {
-#if CHECK_SUBPEL
- const BLOCK_SIZE bsize = mbmi->sb_type;
- int plane;
- int ref = (dir >> 1);
-
- if (dir & 0x01) {
- if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
- } else {
- if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
- }
-
- return 0;
-#else
- (void)mbmi;
- (void)xd;
- (void)dir;
- return 1;
-#endif
-}
-
static INLINE void set_default_interp_filters(
MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
mbmi->interp_filters =
@@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
return 1;
}
-static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
- MB_MODE_INFO *const mi = xd->mi[0];
- const int is_compound = has_second_ref(mi);
- int ref;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- int row_col;
- for (row_col = 0; row_col < 2; ++row_col) {
- const int dir = (ref << 1) + row_col;
- if (has_subpel_mv_component(mi, xd, dir)) {
- return 1;
- }
- }
- }
- return 0;
-}
void av1_setup_build_prediction_by_above_pred(
MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
@@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
MB_MODE_INFO *left_mbmi,
struct build_prediction_ctxt *ctxt,
const int num_planes);
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- int tmp_width[MAX_MB_PLANE],
- int tmp_height[MAX_MB_PLANE],
- int tmp_stride[MAX_MB_PLANE]);
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *above[MAX_MB_PLANE],
@@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
const uint8_t *av1_get_obmc_mask(int length);
void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col);
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col);
#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
@@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
const uint8_t *av1_get_compound_type_mask(
const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
-void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
- uint8_t *ypred, uint8_t *upred,
- uint8_t *vpred, int ystride, int ustride,
- int vstride, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
// build interintra_predictors for one plane
void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *pred, int stride,
@@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
-// Encoder only
-void av1_build_inter_predictors_for_planes_single_buf(
- MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
- int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
- int can_use_previous);
-void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane_from, int plane_to,
- uint8_t *ext_dst0[3],
- int ext_dst_stride0[3],
- uint8_t *ext_dst1[3],
- int ext_dst_stride1[3]);
-
void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
int order_idx, int *fwd_offset, int *bck_offset,
int *use_jnt_comp_avg, int is_compound);
@@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi,
} // extern "C"
#endif
-#endif // AV1_COMMON_RECONINTER_H_
+#endif // AOM_AV1_COMMON_RECONINTER_H_
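
The new is_intrabc parameter threaded through inter_predictor and highbd_inter_predictor above is guarded by assert(IMPLIES(is_intrabc, !is_scaled)): intra block copy always references the current frame, so its scale factors are the identity and the scaled path can never be taken. IMPLIES(a, b) is aom's logical-implication macro, (!(a) || (b)); a minimal sketch of how the assertion reads:

    #include <assert.h>

    #define IMPLIES(a, b) (!(a) || (b)) /* as in aom_dsp/aom_dsp_common.h */

    int main(void) {
      int is_intrabc = 1, is_scaled = 0;
      assert(IMPLIES(is_intrabc, !is_scaled)); /* intraBC => unscaled */
      is_intrabc = 0; is_scaled = 1;
      assert(IMPLIES(is_intrabc, !is_scaled)); /* vacuously true when !a */
      return 0;
    }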
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
index 57638f24e..07853aba0 100644
--- a/third_party/aom/av1/common/reconintra.h
+++ b/third_party/aom/av1/common/reconintra.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RECONINTRA_H_
-#define AV1_COMMON_RECONINTRA_H_
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
#include <stdlib.h>
@@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_RECONINTRA_H_
+#endif // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 93d62292a..d61a20aa2 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
{ -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 },
};
-// Filters for interpolation (full-band) - no filtering for integer pixels
-static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = {
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
- { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
- { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
- { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
- { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
- { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
- { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
- { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
- { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
- { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
- { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
- { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
- { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
- { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
- { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
- { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
- { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
- { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
- { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
- { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
- { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
- { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
- { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
- { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
- { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
- { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
- { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
- { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
- { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
- { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
- { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
- { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
-};
-
const int16_t av1_resize_filter_normative[(
1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
#if UPSCALE_NORMATIVE_TAPS == 8
@@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[(
#endif // UPSCALE_NORMATIVE_TAPS == 8
};
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
// Filters for factor of 2 downsampling.
static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
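
The 64-entry filteredinterp_filters1000 table removed above duplicated the 8-tap av1_resize_filter_normative table, which is why the patch can keep one definition and alias the old name with a #define. The kernels themselves are unity-gain at the 7-bit filter precision: every 8-tap phase sums to 128, with phase 0 the pure copy kernel. A quick check over the first four phases of the deleted table:

    #include <assert.h>
    #include <stdint.h>

    /* First four phases of the (removed) full-band interpolation table. */
    static const int16_t kernels[4][8] = {
      { 0, 0, 0, 128, 0, 0, 0, 0 },   { 0, 0, -1, 128, 2, -1, 0, 0 },
      { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
    };

    int main(void) {
      for (int p = 0; p < 4; ++p) {
        int sum = 0;
        for (int t = 0; t < 8; ++t) sum += kernels[p][t];
        assert(sum == 128); /* unity gain: 128 == 1.0 in Q7 */
      }
      return 0;
    }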
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index feec3a90e..9a59a8d63 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RESIZE_H_
-#define AV1_ENCODER_RESIZE_H_
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
#include <stdio.h>
#include "aom/aom_integer.h"
@@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
} // extern "C"
#endif
-#endif // AV1_ENCODER_RESIZE_H_
+#endif // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
index 632967957..d276a915b 100644
--- a/third_party/aom/av1/common/restoration.c
+++ b/third_party/aom/av1/common/restoration.c
@@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = {
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
};
-static void selfguided_restoration_fast_internal(
- int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
- int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+ int dgd_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx,
+ int pass, int32_t *A, int32_t *B) {
const sgr_params_type *const params = &sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
@@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal(
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
- int32_t A_[RESTORATION_PROC_UNIT_PELS];
- int32_t B_[RESTORATION_PROC_UNIT_PELS];
- int32_t *A = A_;
- int32_t *B = B_;
+ const int step = pass == 0 ? 1 : 2;
int i, j;
assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
@@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal(
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
// Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
// for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
- for (i = -1; i < height + 1; i += 2) {
+ for (i = -1; i < height + 1; i += step) {
for (j = -1; j < width + 1; ++j) {
const int k = i * buf_stride + j;
const int n = (2 * r + 1) * (2 * r + 1);
@@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal(
SGRPROJ_RECIP_BITS);
}
}
+}
+
+static void selfguided_restoration_fast_internal(
+ int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 1, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
// Use the A[] and B[] arrays to calculate the filtered image
+ (void)r;
assert(r == 2);
for (i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
@@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
int dst_stride, int bit_depth,
int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
- const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
- const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
@@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
int32_t *A = A_;
int32_t *B = B_;
int i, j;
-
- assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
- assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
- "Need SGRPROJ_BORDER_* >= r+1");
-
- boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
- width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
- boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
- width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 0, A, B);
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
- // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
- // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
- for (i = -1; i < height + 1; ++i) {
- for (j = -1; j < width + 1; ++j) {
- const int k = i * buf_stride + j;
- const int n = (2 * r + 1) * (2 * r + 1);
-
- // a < 2^16 * n < 2^22 regardless of bit depth
- uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
- // b < 2^8 * n < 2^14 regardless of bit depth
- uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
-
- // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
- // and p itself satisfies p < 2^14 * n^2 < 2^26.
- // This bound on p is due to:
- // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
- //
- // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
- // This is an artefact of rounding, and can only happen if all pixels
- // are (almost) identical, so in this case we saturate to p=0.
- uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
-
- const uint32_t s = params->s[radius_idx];
-
- // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
- // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
- // (this holds even after accounting for the rounding in s)
- const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
-
- // Note: We have to be quite careful about the value of A[k].
- // This is used as a blend factor between individual pixel values and the
- // local mean. So it logically has a range of [0, 256], including both
- // endpoints.
- //
- // This is a pain for hardware, as we'd like something which can be stored
- // in exactly 8 bits.
- // Further, in the calculation of B[k] below, if z == 0 and r == 2,
- // then A[k] "should be" 0. But then we can end up setting B[k] to a value
- // slightly above 2^(8 + bit depth), due to rounding in the value of
- // one_by_x[25-1].
- //
- // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
- // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
- // overflow), without significantly affecting the final result: z == 0
- // implies that the image is essentially "flat", so the local mean and
- // individual pixel values are very similar.
- //
-    // Note that saturating on the other side, i.e. requiring A[k] <= 255,
- // would be a bad idea, as that corresponds to the case where the image
- // is very variable, when we want to preserve the local pixel value as
- // much as possible.
- A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
- // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
- // one_by_x[n - 1] = round(2^12 / n)
- // => the product here is < 2^(20 + bit_depth) <= 2^32,
- // and B[k] is set to a value < 2^(8 + bit depth)
- // This holds even with the rounding in one_by_x and in the overall
- // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
- B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
- (uint32_t)B[k] *
- (uint32_t)one_by_x[n - 1],
- SGRPROJ_RECIP_BITS);
- }
- }
// Use the A[] and B[] arrays to calculate the filtered image
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
}
}
-void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
@@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
if (params->r[1] > 0)
selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
flt_stride, bit_depth, sgr_params_idx, 1);
+ return 0;
}
void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
@@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width,
- eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_c(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
index aec37d834..d834f9270 100644
--- a/third_party/aom/av1/common/restoration.h
+++ b/third_party/aom/av1/common/restoration.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_RESTORATION_H_
-#define AV1_COMMON_RESTORATION_H_
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
#include "aom_ports/mem.h"
#include "config/aom_config.h"
@@ -120,6 +120,7 @@ extern "C" {
// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
#define WIENER_FILT_PREC_BITS 7
#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
@@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
} // extern "C"
#endif
-#endif // AV1_COMMON_RESTORATION_H_
+#endif // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
index 5f02fdb81..748e958c3 100644
--- a/third_party/aom/av1/common/scale.h
+++ b/third_party/aom/av1/common/scale.h
@@ -9,12 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SCALE_H_
-#define AV1_COMMON_SCALE_H_
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
#include "av1/common/convolve.h"
#include "av1/common/mv.h"
-#include "aom_dsp/aom_convolve.h"
#ifdef __cplusplus
extern "C" {
@@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
} // extern "C"
#endif
-#endif // AV1_COMMON_SCALE_H_
+#endif // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
index d206586b5..233dc0efa 100644
--- a/third_party/aom/av1/common/scan.h
+++ b/third_party/aom/av1/common/scan.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SCAN_H_
-#define AV1_COMMON_SCAN_H_
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
@@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
} // extern "C"
#endif
-#endif // AV1_COMMON_SCAN_H_
+#endif // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
index c851d65fd..8c35bba86 100644
--- a/third_party/aom/av1/common/seg_common.h
+++ b/third_party/aom/av1/common/seg_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_SEG_COMMON_H_
-#define AV1_COMMON_SEG_COMMON_H_
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
#include "aom_dsp/prob.h"
@@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
} // extern "C"
#endif
-#endif // AV1_COMMON_SEG_COMMON_H_
+#endif // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
index f9b734b8c..8df4c9a09 100644
--- a/third_party/aom/av1/common/thread_common.c
+++ b/third_party/aom/av1/common/thread_common.c
@@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows(
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->xd, lf_sync);
return 1;
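The cast is dropped from the worker->hook assignments below because the hooks now match the generic worker signature directly; presumably (an assumption about the matching aom_util change, which is not part of this diff) the typedef is:

typedef int (*AVxWorkerHook)(void *arg1, void *arg2);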
@@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
AVxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
- worker->hook = (AVxWorkerHook)loop_filter_row_worker;
+ worker->hook = loop_filter_row_worker;
worker->data1 = lf_sync;
worker->data2 = lf_data;
@@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
}
// Implement row loop restoration for each thread.
-static int loop_restoration_row_worker(AV1LrSync *const lr_sync,
- LRWorkerData *lrworkerdata) {
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+ AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+ LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
int lr_unit_row;
@@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
int num_rows_lr = 0;
for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
const int max_tile_h = tile_rect.bottom - tile_rect.top;
- const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+ const int unit_size = cm->rst_info[plane].restoration_unit_size;
num_rows_lr =
AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
@@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
for (i = 0; i < num_workers; ++i) {
AVxWorker *const worker = &workers[i];
lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
- worker->hook = (AVxWorkerHook)loop_restoration_row_worker;
+ worker->hook = loop_restoration_row_worker;
worker->data1 = lr_sync;
worker->data2 = &lr_sync->lrworkerdata[i];
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
index 4b0d5d2b8..23d61d72a 100644
--- a/third_party/aom/av1/common/thread_common.h
+++ b/third_party/aom/av1/common/thread_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_
-#define AV1_COMMON_LOOPFILTER_THREAD_H_
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
#include "config/aom_config.h"
@@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
} // extern "C"
#endif
-#endif // AV1_COMMON_LOOPFILTER_THREAD_H_
+#endif // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
index 026c904b6..1b413487f 100644
--- a/third_party/aom/av1/common/tile_common.c
+++ b/third_party/aom/av1/common/tile_common.c
@@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
assert(tile->mi_col_end > tile->mi_col_start);
}
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_rows;
+}
+
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_cols;
+}
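Both helpers round the tile's mi extent up to a whole number of superblocks before converting to SB units. A worked example, assuming the standard ALIGN_POWER_OF_TWO definition from aom_dsp_common.h:

#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

// With 64x64 superblocks, mib_size_log2 == 4 (16 4x4 mi units per SB).
// A tile spanning 17 mi rows therefore has:
//   ALIGN_POWER_OF_TWO(17, 4) == 32, and 32 >> 4 == 2 superblock rows.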
+
int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
// Round the frame up to a whole number of max superblocks
mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
index be037fb17..c03553dc6 100644
--- a/third_party/aom/av1/common/tile_common.h
+++ b/third_party/aom/av1/common/tile_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_TILE_COMMON_H_
-#define AV1_COMMON_TILE_COMMON_H_
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
#ifdef __cplusplus
extern "C" {
@@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
// tiles horizontally or vertically in the frame.
int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
typedef struct {
int left, top, right, bottom;
} AV1PixelRect;
@@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm);
} // extern "C"
#endif
-#endif // AV1_COMMON_TILE_COMMON_H_
+#endif // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
index 1749baa57..06939ae43 100644
--- a/third_party/aom/av1/common/timing.h
+++ b/third_party/aom/av1/common/timing.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_TIMING_H_
-#define AOM_TIMING_H_
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
#include "aom/aom_integer.h"
#include "av1/common/enums.h"
@@ -56,4 +56,4 @@ void set_resource_availability_parameters(
int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
int seq_tier);
-#endif // AOM_TIMING_H_
+#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
index 9a6b454ac..53e956450 100644
--- a/third_party/aom/av1/common/token_cdfs.h
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
#include "config/aom_config.h"
#include "av1/common/entropy.h"
@@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
{ AOM_CDF3(10923, 21845) },
{ AOM_CDF3(10923, 21845) },
{ AOM_CDF3(10923, 21845) } } } } };
+
+#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
index f0ab79d0f..1dda51f8b 100644
--- a/third_party/aom/av1/common/txb_common.h
+++ b/third_party/aom/av1/common/txb_common.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_TXB_COMMON_H_
-#define AV1_COMMON_TXB_COMMON_H_
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
extern const int16_t k_eob_group_start[12];
extern const int16_t k_eob_offset_bits[12];
@@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = {
0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
};
-// Note: TX_PAD_2D is dependent on this offset table.
-static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = {
- /* clang-format off*/
- { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 },
- { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 }
- /* clang-format on*/
-};
-
-#define CONTEXT_MAG_POSITION_NUM 3
-static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = {
- { { 0, 1 }, { 1, 0 }, { 1, 1 } },
- { { 0, 1 }, { 1, 0 }, { 0, 2 } },
- { { 0, 1 }, { 1, 0 }, { 2, 0 } }
-};
-static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = {
- { 0, 1 }, { 1, 0 }, { 1, 1 }
-};
-
static const TX_CLASS tx_type_to_class[TX_TYPES] = {
TX_CLASS_2D, // DCT_DCT
TX_CLASS_2D, // ADST_DCT
@@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = {
TX_CLASS_HORIZ, // H_FLIPADST
};
-static const int8_t eob_to_pos_small[33] = {
- 0, 1, 2, // 0-2
- 3, 3, // 3-4
- 4, 4, 4, 4, // 5-8
- 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
-};
-
-static const int8_t eob_to_pos_large[17] = {
- 6, // placeholder
- 7, // 33-64
- 8, 8, // 65-128
- 9, 9, 9, 9, // 129-256
- 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
- 11 // 513-
-};
-
-static INLINE int get_eob_pos_token(const int eob, int *const extra) {
- int t;
-
- if (eob < 33) {
- t = eob_to_pos_small[eob];
- } else {
- const int e = AOMMIN((eob - 1) >> 5, 16);
- t = eob_to_pos_large[e];
- }
-
- *extra = eob - k_eob_group_start[t];
-
- return t;
-}
-
-static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type,
- const int eob_token) {
- static const int8_t tx_type_to_offset[TX_TYPES] = {
- -1, // DCT_DCT
- -1, // ADST_DCT
- -1, // DCT_ADST
- -1, // ADST_ADST
- -1, // FLIPADST_DCT
- -1, // DCT_FLIPADST
- -1, // FLIPADST_FLIPADST
- -1, // ADST_FLIPADST
- -1, // FLIPADST_ADST
- -1, // IDTX
- 10, // V_DCT
- 10, // H_DCT
- 10, // V_ADST
- 10, // H_ADST
- 10, // V_FLIPADST
- 10, // H_FLIPADST
- };
- return eob_token + tx_type_to_offset[tx_type];
-}
-
static INLINE int get_txb_bwl(TX_SIZE tx_size) {
tx_size = av1_get_adjusted_tx_size(tx_size);
return tx_size_wide_log2[tx_size];
@@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) {
return tx_size_high[tx_size];
}
-static INLINE void get_base_count_mag(int *mag, int *count,
- const tran_low_t *tcoeffs, int bwl,
- int height, int row, int col) {
- mag[0] = 0;
- mag[1] = 0;
- for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0;
- for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
- const int ref_row = row + base_ref_offset[idx][0];
- const int ref_col = col + base_ref_offset[idx][1];
- if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
- ref_col >= (1 << bwl))
- continue;
- const int pos = (ref_row << bwl) + ref_col;
- tran_low_t abs_coeff = abs(tcoeffs[pos]);
- // count
- for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
- count[i] += abs_coeff > i;
- }
- // mag
- if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) {
- if (abs_coeff > mag[0]) {
- mag[0] = abs_coeff;
- mag[1] = 1;
- } else if (abs_coeff == mag[0]) {
- ++mag[1];
- }
- }
- }
-}
-
static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
}
@@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) {
return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
}
-static INLINE int get_level_count(const uint8_t *const levels, const int stride,
- const int row, const int col, const int level,
- const int (*nb_offset)[2], const int nb_num) {
- int count = 0;
-
- for (int idx = 0; idx < nb_num; ++idx) {
- const int ref_row = row + nb_offset[idx][0];
- const int ref_col = col + nb_offset[idx][1];
- const int pos = ref_row * stride + ref_col;
- count += levels[pos] > level;
- }
- return count;
-}
-
-static INLINE void get_level_mag(const uint8_t *const levels, const int stride,
- const int row, const int col, int *const mag) {
- for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) {
- const int ref_row = row + mag_ref_offset[idx][0];
- const int ref_col = col + mag_ref_offset[idx][1];
- const int pos = ref_row * stride + ref_col;
- mag[idx] = levels[pos];
- }
-}
-
static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
int sig_mag) {
const int ctx = base_level_count_to_index[count];
@@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
return ctx_idx;
}
-static INLINE int get_base_ctx(const uint8_t *const levels,
- const int c, // raster order
- const int bwl, const int level_minus_1,
- const int count) {
- const int row = c >> bwl;
- const int col = c - (row << bwl);
- const int stride = (1 << bwl) + TX_PAD_HOR;
- int mag_count = 0;
- int nb_mag[3] = { 0 };
-
- get_level_mag(levels, stride, row, col, nb_mag);
-
- for (int idx = 0; idx < 3; ++idx)
- mag_count += nb_mag[idx] > (level_minus_1 + 1);
- const int ctx_idx =
- get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count));
- return ctx_idx;
-}
-
-#define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context
-// Note: TX_PAD_2D is dependent on this offset table.
-static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = {
- /* clang-format off*/
- { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 },
- { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 },
- /* clang-format on*/
-};
-
-static const int br_level_map[9] = {
- 0, 0, 1, 1, 2, 2, 3, 3, 3,
-};
-
-// Note: If BR_MAG_OFFSET changes, the calculation of offset in
-// get_br_ctx_from_count_mag() must be updated.
-#define BR_MAG_OFFSET 1
-// TODO(angiebird): optimize this function by using a table to map from
-// count/mag to ctx
-
-static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl,
- int height, int row, int col, int level) {
- mag[0] = 0;
- mag[1] = 0;
- int count = 0;
- for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) {
- const int ref_row = row + br_ref_offset[idx][0];
- const int ref_col = col + br_ref_offset[idx][1];
- if (ref_row < 0 || ref_col < 0 || ref_row >= height ||
- ref_col >= (1 << bwl))
- continue;
- const int pos = (ref_row << bwl) + ref_col;
- tran_low_t abs_coeff = abs(tcoeffs[pos]);
- count += abs_coeff > level;
- if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) {
- if (abs_coeff > mag[0]) {
- mag[0] = abs_coeff;
- mag[1] = 1;
- } else if (abs_coeff == mag[0]) {
- ++mag[1];
- }
- }
- }
- return count;
-}
-
-static INLINE int get_br_ctx_from_count_mag(const int row, const int col,
- const int count, const int mag) {
- // DC: 0 - 1
- // Top row: 2 - 4
- // Left column: 5 - 7
- // others: 8 - 11
- static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } };
- const int mag_clamp = AOMMIN(mag, 6);
- const int offset = mag_clamp >> 1;
- const int ctx =
- br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col];
- return ctx;
-}
-
static INLINE int get_br_ctx_2d(const uint8_t *const levels,
const int c, // raster order
const int bwl) {
@@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
return mag + 14;
}
-#define SIG_REF_OFFSET_NUM 5
-
-// Note: TX_PAD_2D is dependent on these offset tables.
-static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = {
- { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 }
- // , { 1, 2 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = {
- { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 }
- // , { 1, 1 }, { 2, 1 },
-};
-
-static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = {
- { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 }
- // , { 1, 1 }, { 1, 2 },
-};
-
-#define SIG_REF_DIFF_OFFSET_NUM 3
-
-static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 1, 1 }, { 0, 2 }, { 2, 0 }
-};
-
-static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 2, 0 }, { 3, 0 }, { 4, 0 }
-};
-
-static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = {
- { 0, 2 }, { 0, 3 }, { 0, 4 }
-};
-
static const uint8_t clip_max3[256] = {
0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
void av1_init_lv_map(AV1_COMMON *cm);
-#endif // AV1_COMMON_TXB_COMMON_H_
+#endif // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index 412d83ed8..4144c4389 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -562,7 +562,7 @@ static int64_t highbd_warp_error(
const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
- ConvolveParams conv_params = get_conv_params(0, 0, 0, bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
conv_params.use_jnt_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
@@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
- ConvolveParams conv_params = get_conv_params(0, 0, 0, 8);
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
conv_params.use_jnt_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
index ce4032ee5..a1a4f067d 100644
--- a/third_party/aom/av1/common/warped_motion.h
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_WARPED_MOTION_H_
-#define AV1_COMMON_WARPED_MOTION_H_
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
#include <stdio.h>
#include <stdlib.h>
@@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
int mi_col);
int get_shear_params(WarpedMotionParams *wm);
-#endif // AV1_COMMON_WARPED_MOTION_H_
+#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 0c5286f9d..d9fb53785 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
index ae331b40d..5db2ccf6c 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -18,6 +18,12 @@
#include "av1/common/x86/av1_inv_txfm_avx2.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
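These are the sqrt(2) powers in Q12 fixed point used by the rectangular transforms. Note the odd powers reuse the once-rounded base constant 5793 ~= 2^12 * sqrt(2) rather than rounding each power separately; a small self-check (a sketch, not upstream code):

#include <assert.h>
// NewSqrt2list[k] == 2^12 * sqrt(2)^(k+1), with sqrt(2) rounded once:
static void check_new_sqrt2_list(void) {
  const int32_t kSqrt2Q12 = 5793;  // round(4096 * sqrt(2))
  const int32_t expect[5] = { kSqrt2Q12, 2 * 4096, 2 * kSqrt2Q12, 4 * 4096,
                              4 * kSqrt2Q12 };
  for (int k = 0; k < 5; ++k) assert(NewSqrt2list[k] == expect[k]);
}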
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
const __m256i _r, int8_t cos_bit) {
const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
index 7b5b29cf8..f74cbaeaa 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
#include <immintrin.h>
@@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
}
#endif
-#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
index dd7cee24c..995bc3da4 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -16,6 +16,12 @@
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
// TODO(binpengsmail@gmail.com): replace some for loops with do {} while loops
static void idct4_new_sse2(const __m128i *input, __m128i *output,
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
index dc9be25d2..66bd339d1 100644
--- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
-#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3
@@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
};
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
- 4 * 5793 };
-
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
};
@@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
} // extern "C"
#endif
-#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
index 721cfe059..77aeb6eb1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
#include <emmintrin.h> // SSE2
@@ -314,4 +314,4 @@ typedef struct {
#ifdef __cplusplus
}
#endif // __cplusplus
-#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
index 367e02096..6cad821b1 100644
--- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXFM_SSE4_H_
-#define AV1_TXFM_SSE4_H_
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
#include <smmintrin.h>
@@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
__m128i *output,
const int size,
- const int bit) {
- const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+ const int bit,
+ const int val) {
+ const __m128i sqrt2 = _mm_set1_epi32(val);
if (bit > 0) {
int i;
for (i = 0; i < size; i++) {
@@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
}
#endif
-#endif // AV1_TXFM_SSE4_H_
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
index 7479ac3e1..3b342cd4e 100644
--- a/third_party/aom/av1/common/x86/cfl_simd.h
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
#include "av1/common/blockd.h"
// The SSSE3 versions are optimal for width == 4; we reuse them in AVX2
@@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3, int bd);
void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int alpha_q3, int bd);
+
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
index 1099144fe..0acafd044 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -11,10 +11,8 @@
#include <immintrin.h>
-#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index 637f83cf7..b1a62a4f6 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,9 +11,8 @@
#include <emmintrin.h>
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
index f66dee37d..5016642de 100644
--- a/third_party/aom/av1/common/x86/convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -11,9 +11,8 @@
#include <emmintrin.h>
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
@@ -76,8 +75,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
return convolve(ss, coeffs);
}
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
@@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
}
}
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
- const uint8_t *dst, int dst_stride, int w, int h,
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
index 8444ffa93..ae68f0bbb 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
index eb340523a..3f8dafb4b 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -15,7 +15,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 33183fdee..1d029db39 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index debb05a6d..ade2af03e 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -15,6 +15,9 @@
#include "config/av1_rtcd.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
// Note:
// Total 32x4 registers to represent 32x32 block coefficients.
@@ -27,131 +30,125 @@
// ... ...
// v124, v125, v126, v127
-static void transpose_32x32_8x8(const __m256i *in, __m256i *out) {
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(u, max);
+ clamped = _mm256_andnot_si256(mask, u);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
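Per 16-bit lane, the mask arithmetic above is simply a clamp into the valid pixel range; a scalar sketch of the same logic:

// Scalar equivalent of highbd_clamp_epi16_avx2 for one lane: clamp a
// signed value into [0, (1 << bd) - 1] (bd <= 12 in AV1).
static int16_t clamp_pixel(int16_t u, int bd) {
  const int16_t max = (int16_t)((1 << bd) - 1);
  if (u > max) u = max;  // the cmpgt/andnot/and/or sequence above
  if (u < 0) u = 0;      // the final cmpgt/and pair
  return u;
}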
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+ __m256i res0, __m256i res1,
+ const int bd) {
+ __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+ __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+
+ x0 = _mm256_add_epi32(res0, x0);
+ x1 = _mm256_add_epi32(res1, x1);
+ x0 = _mm256_packus_epi32(x0, x1);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+ __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+
+ _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+ }
+}
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
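The pair above implements the usual round-half-up shift, with a plain left shift for non-positive bit counts in the array variant; scalar sketch:

// Scalar form of av1_round_shift_32_avx2 / the array wrapper above.
static int32_t round_shift_32(int32_t x, int bit) {
  if (bit > 0) return (x + (1 << (bit - 1))) >> bit;  // round half up
  return x << -bit;  // the array variant's bit <= 0 branch
}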
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
__m256i u0, u1, u2, u3, u4, u5, u6, u7;
__m256i x0, x1;
- u0 = _mm256_unpacklo_epi32(in[0], in[4]);
- u1 = _mm256_unpackhi_epi32(in[0], in[4]);
+ u0 = _mm256_unpacklo_epi32(in[0], in[1]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[1]);
- u2 = _mm256_unpacklo_epi32(in[8], in[12]);
- u3 = _mm256_unpackhi_epi32(in[8], in[12]);
+ u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+ u3 = _mm256_unpackhi_epi32(in[2], in[3]);
- u4 = _mm256_unpacklo_epi32(in[16], in[20]);
- u5 = _mm256_unpackhi_epi32(in[16], in[20]);
+ u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+ u5 = _mm256_unpackhi_epi32(in[4], in[5]);
- u6 = _mm256_unpacklo_epi32(in[24], in[28]);
- u7 = _mm256_unpackhi_epi32(in[24], in[28]);
+ u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+ u7 = _mm256_unpackhi_epi32(in[6], in[7]);
x0 = _mm256_unpacklo_epi64(u0, u2);
x1 = _mm256_unpacklo_epi64(u4, u6);
out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[16] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpackhi_epi64(u0, u2);
x1 = _mm256_unpackhi_epi64(u4, u6);
- out[4] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[20] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpacklo_epi64(u1, u3);
x1 = _mm256_unpacklo_epi64(u5, u7);
- out[8] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[24] = _mm256_permute2f128_si256(x0, x1, 0x31);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
x0 = _mm256_unpackhi_epi64(u1, u3);
x1 = _mm256_unpackhi_epi64(u5, u7);
- out[12] = _mm256_permute2f128_si256(x0, x1, 0x20);
- out[28] = _mm256_permute2f128_si256(x0, x1, 0x31);
-}
-
-static void transpose_32x32_16x16(const __m256i *in, __m256i *out) {
- transpose_32x32_8x8(&in[0], &out[0]);
- transpose_32x32_8x8(&in[1], &out[32]);
- transpose_32x32_8x8(&in[32], &out[1]);
- transpose_32x32_8x8(&in[33], &out[33]);
-}
-
-static void transpose_32x32(const __m256i *in, __m256i *out) {
- transpose_32x32_16x16(&in[0], &out[0]);
- transpose_32x32_16x16(&in[2], &out[64]);
- transpose_32x32_16x16(&in[64], &out[2]);
- transpose_32x32_16x16(&in[66], &out[66]);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
}
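The unpack/permute sequence above is an in-register 8x8 transpose of 32-bit lanes; its scalar meaning is simply:

// Scalar model of transpose_8x8_avx2, where each __m256i row holds
// eight int32_t lanes: out is the transpose of in.
static void transpose_8x8_ref(const int32_t in[8][8], int32_t out[8][8]) {
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) out[r][c] = in[c][r];
}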
-static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
+ int input_stride, int size) {
int i;
- for (i = 0; i < 128; ++i) {
- in[i] = _mm256_loadu_si256((const __m256i *)coeff);
- coeff += 8;
+ for (i = 0; i < size; ++i) {
+ in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
}
}
-static __m256i highbd_clamp_epi32(__m256i x, int bd) {
- const __m256i zero = _mm256_setzero_si256();
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
- __m256i clamped, mask;
-
- mask = _mm256_cmpgt_epi16(x, max);
- clamped = _mm256_andnot_si256(mask, x);
- mask = _mm256_and_si256(mask, max);
- clamped = _mm256_or_si256(mask, clamped);
- mask = _mm256_cmpgt_epi16(clamped, zero);
- clamped = _mm256_and_si256(clamped, mask);
-
- return clamped;
-}
-
-static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3;
- const __m256i zero = _mm256_setzero_si256();
- int i = 0;
- (void)fliplr;
- (void)flipud;
-
- __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
-
- while (i < 128) {
- u0 = _mm256_loadu_si256((const __m256i *)output);
- u1 = _mm256_loadu_si256((const __m256i *)(output + 16));
-
- x0 = _mm256_unpacklo_epi16(u0, zero);
- x1 = _mm256_unpackhi_epi16(u0, zero);
- x2 = _mm256_unpacklo_epi16(u1, zero);
- x3 = _mm256_unpackhi_epi16(u1, zero);
-
- v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20);
- v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31);
- v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
- v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
-
- v0 = _mm256_add_epi32(v0, round);
- v1 = _mm256_add_epi32(v1, round);
- v2 = _mm256_add_epi32(v2, round);
- v3 = _mm256_add_epi32(v3, round);
-
- v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
- v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
- v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
- v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
-
- v0 = _mm256_add_epi32(v0, x0);
- v1 = _mm256_add_epi32(v1, x1);
- v2 = _mm256_add_epi32(v2, x2);
- v3 = _mm256_add_epi32(v3, x3);
-
- v0 = _mm256_packus_epi32(v0, v1);
- v2 = _mm256_packus_epi32(v2, v3);
-
- v0 = highbd_clamp_epi32(v0, bd);
- v2 = highbd_clamp_epi32(v2, bd);
-
- _mm256_storeu_si256((__m256i *)output, v0);
- _mm256_storeu_si256((__m256i *)(output + 16), v2);
- output += stride;
- i += 4;
- }
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *rounding, int bit) {
+ __m256i x;
+ x = _mm256_mullo_epi32(*w0, *n0);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
}
static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
@@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
__m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
__m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm256_max_epi32(a0, *clamp_lo);
a0 = _mm256_min_epi32(a0, *clamp_hi);
a1 = _mm256_max_epi32(a1, *clamp_lo);
a1 = _mm256_min_epi32(a1, *clamp_hi);
- a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
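Note the hunk above moves the rounding shift ahead of the clamp, so values are now clamped in their post-shift range, matching the clamp_lo_out/clamp_hi_out bounds computed by the callers. A scalar sketch of the new order, assuming in0_w_offset is in0 plus the (1 << shift) >> 1 rounding offset as its name suggests:

static void addsub_shift(int32_t in0, int32_t in1, int32_t *out0,
                         int32_t *out1, int32_t lo, int32_t hi, int shift) {
  const int32_t offset = (1 << shift) >> 1;
  int32_t a0 = (in0 + offset + in1) >> shift;  // shift first...
  int32_t a1 = (in0 + offset - in1) >> shift;
  *out0 = a0 < lo ? lo : (a0 > hi ? hi : a0);  // ...then clamp
  *out1 = a1 < lo ? lo : (a1 > hi ? hi : a1);
}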
+static INLINE void idct32_stage4_avx2(
+ __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+ const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+ const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+ __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+ const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+ __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ x = _mm256_add_epi32(offset, x);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm256_max_epi32(x, clamp_lo_out);
+ x = _mm256_min_epi32(x, clamp_hi_out);
+ }
+
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+ out[8] = x;
+ out[9] = x;
+ out[10] = x;
+ out[11] = x;
+ out[12] = x;
+ out[13] = x;
+ out[14] = x;
+ out[15] = x;
+ out[16] = x;
+ out[17] = x;
+ out[18] = x;
+ out[19] = x;
+ out[20] = x;
+ out[21] = x;
+ out[22] = x;
+ out[23] = x;
+ out[24] = x;
+ out[25] = x;
+ out[26] = x;
+ out[27] = x;
+ out[28] = x;
+ out[29] = x;
+ out[30] = x;
+ out[31] = x;
+}
+
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
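+// 32-point IDCT assuming only in[0..15] are nonzero (eob limited to the
+// first 16 rows/columns).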
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
int out_shift) {
const int32_t *cospi = cospi_arr(bit);
@@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
__m256i bf1[32], bf0[32];
- int col;
- for (col = 0; col < 4; ++col) {
+ {
// stage 0
// stage 1
- bf1[0] = in[0 * 4 + col];
- bf1[1] = in[16 * 4 + col];
- bf1[2] = in[8 * 4 + col];
- bf1[3] = in[24 * 4 + col];
- bf1[4] = in[4 * 4 + col];
- bf1[5] = in[20 * 4 + col];
- bf1[6] = in[12 * 4 + col];
- bf1[7] = in[28 * 4 + col];
- bf1[8] = in[2 * 4 + col];
- bf1[9] = in[18 * 4 + col];
- bf1[10] = in[10 * 4 + col];
- bf1[11] = in[26 * 4 + col];
- bf1[12] = in[6 * 4 + col];
- bf1[13] = in[22 * 4 + col];
- bf1[14] = in[14 * 4 + col];
- bf1[15] = in[30 * 4 + col];
- bf1[16] = in[1 * 4 + col];
- bf1[17] = in[17 * 4 + col];
- bf1[18] = in[9 * 4 + col];
- bf1[19] = in[25 * 4 + col];
- bf1[20] = in[5 * 4 + col];
- bf1[21] = in[21 * 4 + col];
- bf1[22] = in[13 * 4 + col];
- bf1[23] = in[29 * 4 + col];
- bf1[24] = in[3 * 4 + col];
- bf1[25] = in[19 * 4 + col];
- bf1[26] = in[11 * 4 + col];
- bf1[27] = in[27 * 4 + col];
- bf1[28] = in[7 * 4 + col];
- bf1[29] = in[23 * 4 + col];
- bf1[30] = in[15 * 4 + col];
- bf1[31] = in[31 * 4 + col];
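+ // Stage-1 gather: coefficients in bit-reversed order (0, 16, 8, 24, ...).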
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
// stage 2
bf0[0] = bf1[0];
@@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
// stage 9
if (do_cols) {
- addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
- out + 31 * 4 + col);
- addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
- out + 30 * 4 + col);
- addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
- out + 29 * 4 + col);
- addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
- out + 28 * 4 + col);
- addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
- out + 27 * 4 + col);
- addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
- out + 26 * 4 + col);
- addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
- out + 25 * 4 + col);
- addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
- out + 24 * 4 + col);
- addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
- out + 23 * 4 + col);
- addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
- out + 22 * 4 + col);
- addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
- out + 21 * 4 + col);
- addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
- out + 20 * 4 + col);
- addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
- out + 19 * 4 + col);
- addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
- out + 18 * 4 + col);
- addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
- out + 17 * 4 + col);
- addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
- out + 16 * 4 + col);
+ addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
} else {
- addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
- out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
- out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
- out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
- out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
- out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
- out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
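+ // The shifted output must fit both the 16-bit output range and the
+ // pre-shift clamp scaled by out_shift; e.g. bd = 10, out_shift = 2 gives
+ // AOMMIN(32767, 1 << 15) = 32767 as the high bound.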
+
+ addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
-void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m256i in[128], out[128];
- const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
- const int txw_idx = get_txw_idx(TX_32X32);
- const int txh_idx = get_txh_idx(TX_32X32);
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
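+// Table of 1-D kernels indexed by [tx-size index][1-D tx type][eob bucket];
+// NULL entries have no AVX2 implementation, which the asserts in the caller
+// guard against selecting.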
+static const transform_1d_avx2
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 2];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
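+ // eobx/eoby are the last nonzero column/row; they bound how many 8-wide
+ // strips are processed and pick the reduced-size 1-D kernels below.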
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m256i *buf0_cur = buf0 + j * 8;
+ load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
+ transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+ }
+
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m256i *_buf1 = buf1 + i * 8;
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
switch (tx_type) {
case DCT_DCT:
- load_buffer_32x32(coeff, in);
- transpose_32x32(in, out);
- idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
- transpose_32x32(in, out);
- idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
+ highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ const int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // Assembly version doesn't support IDTX, so use C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+
default: assert(0);
}
}
+
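+// Size dispatch: TX_32X32 uses the AVX2 path above; the remaining sizes
+// fall back to SSE4.1 or C implementations.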
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 801a4133b..e29e0baf5 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -15,8 +15,60 @@
#include "config/av1_rtcd.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
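+// Widen eight 16-bit predicted pixels to 32 bits, add the residuals, pack
+// back with unsigned saturation, then clamp to the bit depth.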
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+ __m128i res0, __m128i res1,
+ const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+ __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+
+ x0 = _mm_add_epi32(res0, x0);
+ x1 = _mm_add_epi32(res1, x1);
+ x0 = _mm_packus_epi32(x0, x1);
+ x0 = highbd_clamp_epi16(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
+
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
+ }
+}
+
static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
@@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i a0 = _mm_add_epi32(in0_w_offset, in1);
__m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
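+ // Reordered: apply the shift before clamping; callers now pass clamp
+ // bounds computed for the post-shift range.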
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm_max_epi32(a0, *clamp_lo);
a0 = _mm_min_epi32(a0, *clamp_hi);
a1 = _mm_max_epi32(a1, *clamp_lo);
a1 = _mm_min_epi32(a1, *clamp_hi);
- a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
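+// idct32 stage helpers, factored out so the reduced-eob variants can share
+// stages 4-9.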
+static INLINE void idct32_stage4_sse4_1(
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 =
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] =
+ half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i *out0, __m128i *out1,
const __m128i *clamp_lo, const __m128i *clamp_hi,
@@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
__m128i a0 = _mm_add_epi32(offset, in0);
__m128i a1 = _mm_sub_epi32(offset, in1);
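+ // Same reordering as addsub_shift_sse4_1: shift first, then clamp.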
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
a0 = _mm_max_epi32(a0, *clamp_lo);
a0 = _mm_min_epi32(a0, *clamp_hi);
a1 = _mm_max_epi32(a1, *clamp_lo);
a1 = _mm_min_epi32(a1, *clamp_hi);
- a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
- a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
-
*out0 = a0;
*out1 = a1;
}
@@ -96,9 +361,6 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
- const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
- const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3, x, y;
@@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
v3 = _mm_add_epi32(v3, rnding);
v3 = _mm_srai_epi32(v3, bit);
- addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
- addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
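+ // In the final column pass the result is left unclamped here;
+ // write_buffer_4x4 clamps to the pixel range when reconstructing.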
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+ addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+ } else {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+ }
}
-static void iadst4x4_sse4_1(__m128i *in, int bit) {
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
const int32_t *sinpi = sinpi_arr(bit);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
@@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) {
u3 = _mm_add_epi32(u3, rnding);
u3 = _mm_srai_epi32(u3, bit);
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ u0 = _mm_max_epi32(u0, clamp_lo);
+ u0 = _mm_min_epi32(u0, clamp_hi);
+ u1 = _mm_max_epi32(u1, clamp_lo);
+ u1 = _mm_min_epi32(u1, clamp_hi);
+ u2 = _mm_max_epi32(u2, clamp_lo);
+ u2 = _mm_min_epi32(u2, clamp_hi);
+ u3 = _mm_max_epi32(u3, clamp_lo);
+ u3 = _mm_min_epi32(u3, clamp_hi);
+ }
+
in[0] = u0;
in[1] = u1;
in[2] = u2;
@@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) {
in[3] = _mm_srai_epi32(in[3], shift);
}
-static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- __m128i clamped, mask;
-
- mask = _mm_cmpgt_epi16(u, max);
- clamped = _mm_andnot_si128(mask, u);
- mask = _mm_and_si128(mask, max);
- clamped = _mm_or_si128(mask, clamped);
- mask = _mm_cmpgt_epi16(clamped, zero);
- clamped = _mm_and_si128(clamped, mask);
-
- return clamped;
-}
-
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
int fliplr, int flipud, int shift, int bd) {
const __m128i zero = _mm_setzero_si128();
@@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
case ADST_DCT:
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
- iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
@@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
} else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
@@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
out[12] = u[5];
out[14] = _mm_sub_epi32(kZero, u[1]);
} else {
- neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
out_shift);
- neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
// Odd 8 points: 1, 3, ..., 15
@@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
out[13] = u[5];
out[15] = _mm_sub_epi32(kZero, u[1]);
} else {
- neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
out_shift);
- neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
@@ -976,64 +1258,1141 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
}
}
-// 16x16
-static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
- int i;
- for (i = 0; i < 64; ++i) {
- in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
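+// DC-only path: with just in[0] nonzero, the 8x8 IDCT reduces to one scale
+// by cospi[32] broadcast to all eight outputs.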
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm_mullo_epi32(in[0], cospi32);
+ x = _mm_add_epi32(x, rnding);
+ x = _mm_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
}
+
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
}
-static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
- int col) {
- int i;
- for (i = 0; i < 16; i += 2) {
- in8x8[i] = in[col];
- in8x8[i + 1] = in[col + 1];
- col += 4;
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
}
}
-static void swap_addr(uint16_t **output1, uint16_t **output2) {
- uint16_t *tmp;
- tmp = *output1;
- *output1 = *output2;
- *output2 = tmp;
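+// DC-only IADST8: only in[0] is nonzero, so each stage reduces to single
+// products instead of full butterflies.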
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
}
-static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in8x8[16];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[8];
- uint16_t *leftDown = &output[8 * stride];
- uint16_t *rightDown = &output[8 * stride + 8];
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
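+  // iadst output permutation: odd-indexed outputs are negated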
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
}
+}
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
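+// 16x16 inverse DCT fast path assuming only the DC coefficient is nonzero:
+// every stage except the cospi32 scaling collapses away.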
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (do_cols) {
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm_add_epi32(in[0], offset);
+ in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+ in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+ }
+
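+    // With only the DC term present, every output row is the same value.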
+    for (int i = 0; i < 16; ++i) out[i] = in[0];
}
+}
- // Left-up quarter
- assign_8x8_input_from_16x16(in, in8x8, 0);
- write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
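+// 16x16 inverse DCT fast path assuming only in[0]..in[7] are nonzero (the
+// low-eob case); the missing inputs are implicitly zero.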
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
- // Right-up quarter
- assign_8x8_input_from_16x16(in, in8x8, 2);
- write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
- // Left-down quarter
- assign_8x8_input_from_16x16(in, in8x8, 32);
- write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
- // Right-down quarter
- assign_8x8_input_from_16x16(in, in8x8, 34);
- write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
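+// 16x16 inverse ADST fast path assuming only in[0] is nonzero; most stages
+// reduce to copies plus a handful of butterflies.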
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+
+  // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
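+// 16x16 inverse ADST fast path assuming only in[0]..in[7] are nonzero.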
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+  // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m128i zero = _mm_setzero_si128();
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
}
static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -1067,27 +2426,26 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
- int col;
- for (col = 0; col < 4; ++col) {
+ {
// stage 0
// stage 1
- u[0] = in[0 * 4 + col];
- u[1] = in[8 * 4 + col];
- u[2] = in[4 * 4 + col];
- u[3] = in[12 * 4 + col];
- u[4] = in[2 * 4 + col];
- u[5] = in[10 * 4 + col];
- u[6] = in[6 * 4 + col];
- u[7] = in[14 * 4 + col];
- u[8] = in[1 * 4 + col];
- u[9] = in[9 * 4 + col];
- u[10] = in[5 * 4 + col];
- u[11] = in[13 * 4 + col];
- u[12] = in[3 * 4 + col];
- u[13] = in[11 * 4 + col];
- u[14] = in[7 * 4 + col];
- u[15] = in[15 * 4 + col];
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
// stage 2
v[0] = u[0];
@@ -1200,37 +2558,37 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 7
if (do_cols) {
- addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
- out + 15 * 4 + col);
- addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
- out + 14 * 4 + col);
- addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
- out + 13 * 4 + col);
- addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
- out + 12 * 4 + col);
- addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
- out + 11 * 4 + col);
- addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
- out + 10 * 4 + col);
- addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
- addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
+ addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
} else {
- addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
- addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
- &clamp_lo, &clamp_hi, out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
@@ -1269,106 +2627,104 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
- const int col_num = 4;
- int col;
// Calculate the column 0, 1, 2, 3
- for (col = 0; col < col_num; ++col) {
+ {
// stage 0
// stage 1
// stage 2
- v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
v[0] = _mm_add_epi32(v[0], x);
v[0] = _mm_add_epi32(v[0], rnding);
v[0] = _mm_srai_epi32(v[0], bit);
- v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
- x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
v[1] = _mm_sub_epi32(v[1], x);
v[1] = _mm_add_epi32(v[1], rnding);
v[1] = _mm_srai_epi32(v[1], bit);
- v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
v[2] = _mm_add_epi32(v[2], x);
v[2] = _mm_add_epi32(v[2], rnding);
v[2] = _mm_srai_epi32(v[2], bit);
- v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
- x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
v[3] = _mm_sub_epi32(v[3], x);
v[3] = _mm_add_epi32(v[3], rnding);
v[3] = _mm_srai_epi32(v[3], bit);
- v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
v[4] = _mm_add_epi32(v[4], x);
v[4] = _mm_add_epi32(v[4], rnding);
v[4] = _mm_srai_epi32(v[4], bit);
- v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
- x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
v[5] = _mm_sub_epi32(v[5], x);
v[5] = _mm_add_epi32(v[5], rnding);
v[5] = _mm_srai_epi32(v[5], bit);
- v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
v[6] = _mm_add_epi32(v[6], x);
v[6] = _mm_add_epi32(v[6], rnding);
v[6] = _mm_srai_epi32(v[6], bit);
- v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
- x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
v[7] = _mm_sub_epi32(v[7], x);
v[7] = _mm_add_epi32(v[7], rnding);
v[7] = _mm_srai_epi32(v[7], bit);
- v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
v[8] = _mm_add_epi32(v[8], x);
v[8] = _mm_add_epi32(v[8], rnding);
v[8] = _mm_srai_epi32(v[8], bit);
- v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
- x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
v[9] = _mm_sub_epi32(v[9], x);
v[9] = _mm_add_epi32(v[9], rnding);
v[9] = _mm_srai_epi32(v[9], bit);
- v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
v[10] = _mm_add_epi32(v[10], x);
v[10] = _mm_add_epi32(v[10], rnding);
v[10] = _mm_srai_epi32(v[10], bit);
- v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
- x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
v[11] = _mm_sub_epi32(v[11], x);
v[11] = _mm_add_epi32(v[11], rnding);
v[11] = _mm_srai_epi32(v[11], bit);
- v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
v[12] = _mm_add_epi32(v[12], x);
v[12] = _mm_add_epi32(v[12], rnding);
v[12] = _mm_srai_epi32(v[12], bit);
- v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
- x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
v[13] = _mm_sub_epi32(v[13], x);
v[13] = _mm_add_epi32(v[13], rnding);
v[13] = _mm_srai_epi32(v[13], bit);
- v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
v[14] = _mm_add_epi32(v[14], x);
v[14] = _mm_add_epi32(v[14], rnding);
v[14] = _mm_srai_epi32(v[14], bit);
- v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
- x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
v[15] = _mm_sub_epi32(v[15], x);
v[15] = _mm_add_epi32(v[15], rnding);
v[15] = _mm_srai_epi32(v[15], bit);
@@ -1575,268 +2931,835 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 9
if (do_cols) {
- out[0 * col_num + col] = v[0];
- out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
- out[2 * col_num + col] = v[12];
- out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
- out[4 * col_num + col] = v[6];
- out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
- out[6 * col_num + col] = v[10];
- out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
- out[8 * col_num + col] = v[3];
- out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
- out[10 * col_num + col] = v[15];
- out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
- out[12 * col_num + col] = v[5];
- out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
- out[14 * col_num + col] = v[9];
- out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
} else {
- neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
- out + 1 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
- out + 3 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
- out + 5 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
- out + 7 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
- out + 9 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
- out + 11 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
- out + 13 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
- neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
- out + 15 * col_num + col, &clamp_lo, &clamp_hi,
- out_shift);
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
}
}
}
-void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m128i in[64], out[64];
- const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
- const int txw_idx = get_txw_idx(TX_16X16);
- const int txh_idx = get_txh_idx(TX_16X16);
-
- switch (tx_type) {
- case DCT_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case DCT_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case ADST_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case ADST_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
- break;
- case FLIPADST_DCT:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
- break;
- case DCT_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
- break;
- case ADST_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
- break;
- case FLIPADST_FLIPADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
- break;
- case FLIPADST_ADST:
- load_buffer_16x16(coeff, in);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
- break;
- default: assert(0);
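+// Stage 8 butterflies of the 64-point inverse DCT, shared by the
+// idct64x64_* variants below.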
+static INLINE void idct64_stage8_sse4_1(
+ __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+ clamp_hi);
}
+
+ temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
}
-static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
- int i, j;
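+// Stage 9 butterflies of the 64-point inverse DCT.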
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
- __m128i zero = _mm_setzero_si128();
+ temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 8; ++j) {
- in[16 * i + j] =
- _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
- in[16 * i + j + 8] = zero;
- }
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
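+// Stage 10 butterflies of the 64-point inverse DCT.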
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ __m128i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
}
- for (i = 0; i < 512; ++i) in[512 + i] = zero;
+ temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
}
-static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
- int i, j;
- for (i = 0; i < (do_cols ? 16 : 8); ++i) {
- for (j = 0; j < 8; ++j) {
- TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
- in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
- out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
- out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
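+// Final stage 11 of the 64-point inverse DCT: add/sub pairs u[i]/u[63 - i];
+// the column pass skips clamping, the row pass shifts by out_shift and
+// clamps to the output range.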
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+ int bd, int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ for (int i = 0; i < 32; i++) {
+ addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
+ }
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ for (int i = 0; i < 32; i++) {
+ addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
-static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
- int col) {
- int i;
- for (i = 0; i < 16 * 16 / 4; i += 4) {
- in16x16[i] = in[col];
- in16x16[i + 1] = in[col + 1];
- in16x16[i + 2] = in[col + 2];
- in16x16[i + 3] = in[col + 3];
- col += 8;
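+// 64x64 inverse DCT fast path assuming only the DC coefficient is nonzero:
+// the whole transform reduces to one cospi32 scaling broadcast to all
+// 64 outputs.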
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+ {
+ __m128i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
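+    // stage 7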
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (do_cols) {
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
+ }
+
+    for (int i = 0; i < 32; ++i) {
+      out[i] = x;
+      out[63 - i] = x;
+    }
}
}
-static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in16x16[16 * 16 / 4];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[16];
- uint16_t *leftDown = &output[16 * stride];
- uint16_t *rightDown = &output[16 * stride + 16];
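+// 64x64 inverse DCT fast path assuming only in[0]..in[7] are nonzero.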
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
- }
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
- }
+ {
+ __m128i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
- // Left-up quarter
- assign_16x16_input_from_32x32(in, in16x16, 0);
- write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
- // Right-up quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
- write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
- // Left-down quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
- write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
- // Right-down quarter
- assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
- write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
-}
+ // stage 6
+ u[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = u[0];
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
-static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
- int col) {
- int i;
- for (i = 0; i < 32 * 32 / 4; i += 8) {
- in32x32[i] = in[col];
- in32x32[i + 1] = in[col + 1];
- in32x32[i + 2] = in[col + 2];
- in32x32[i + 3] = in[col + 3];
- in32x32[i + 4] = in[col + 4];
- in32x32[i + 5] = in[col + 5];
- in32x32[i + 6] = in[col + 6];
- in32x32[i + 7] = in[col + 7];
- col += 16;
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
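+ // The XOR indexing below pairs stage-7 butterflies within each
+ // 16-lane group: for j in [i, i + 4), j ^ 7 mirrors j through the
+ // lower half of the group (32..35 pair with 39..36) while j ^ 15 and
+ // j ^ 8 mirror the upper half, so each addsub emits the sum and
+ // difference lanes of one butterfly.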
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
}
}
-static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
- int fliplr, int flipud, int shift, int bd) {
- __m128i in32x32[32 * 32 / 4];
- uint16_t *leftUp = &output[0];
- uint16_t *rightUp = &output[32];
- uint16_t *leftDown = &output[32 * stride];
- uint16_t *rightDown = &output[32 * stride + 32];
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- if (fliplr) {
- swap_addr(&leftUp, &rightUp);
- swap_addr(&leftDown, &rightDown);
- }
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
- if (flipud) {
- swap_addr(&leftUp, &leftDown);
- swap_addr(&rightUp, &rightDown);
- }
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64];
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
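+ // The scattered in[] order is the usual idct64 stage-1 permutation,
+ // restricted here to the 16 coefficients that can be nonzero on the
+ // low16 path.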
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
- // Left-up quarter
- assign_32x32_input_from_64x64(in, in32x32, 0);
- write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
- // Right-up quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
- write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+ // stage 4
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
- // Left-down quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
- write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+ // stage 5
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
- // Right-down quarter
- assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
- write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
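+ // Each 8-lane group of u[32..63] is folded with mirrored add/sub
+ // pairs, (0,3), (1,2), (7,4), (6,5), i.e. the scalar stage-5
+ // butterfly pattern expressed as clamped SIMD add/sub.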
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ u[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = u[0];
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
}
static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
@@ -1847,7 +3770,6 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
- int col;
const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
@@ -1929,46 +3851,46 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
- for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
+ {
__m128i u[64], v[64];
// stage 1
- u[32] = in[1 * 16 + col];
- u[34] = in[17 * 16 + col];
- u[36] = in[9 * 16 + col];
- u[38] = in[25 * 16 + col];
- u[40] = in[5 * 16 + col];
- u[42] = in[21 * 16 + col];
- u[44] = in[13 * 16 + col];
- u[46] = in[29 * 16 + col];
- u[48] = in[3 * 16 + col];
- u[50] = in[19 * 16 + col];
- u[52] = in[11 * 16 + col];
- u[54] = in[27 * 16 + col];
- u[56] = in[7 * 16 + col];
- u[58] = in[23 * 16 + col];
- u[60] = in[15 * 16 + col];
- u[62] = in[31 * 16 + col];
-
- v[16] = in[2 * 16 + col];
- v[18] = in[18 * 16 + col];
- v[20] = in[10 * 16 + col];
- v[22] = in[26 * 16 + col];
- v[24] = in[6 * 16 + col];
- v[26] = in[22 * 16 + col];
- v[28] = in[14 * 16 + col];
- v[30] = in[30 * 16 + col];
-
- u[8] = in[4 * 16 + col];
- u[10] = in[20 * 16 + col];
- u[12] = in[12 * 16 + col];
- u[14] = in[28 * 16 + col];
-
- v[4] = in[8 * 16 + col];
- v[6] = in[24 * 16 + col];
-
- u[0] = in[0 * 16 + col];
- u[2] = in[16 * 16 + col];
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
// stage 2
v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
@@ -2301,39 +4223,1126 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
// stage 11
if (do_cols) {
for (i = 0; i < 32; i++) {
- addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
- &out[16 * (63 - i) + col]);
+ addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
}
} else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
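+ // The output clamp takes the tighter of the nominal output range
+ // (16-bit or bd + 6) and the intermediate range reduced by out_shift.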
+
for (i = 0; i < 32; i++) {
- addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
- &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
- out_shift);
+ addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
}
}
}
}
-void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- __m128i in[64 * 64 / 4], out[64 * 64 / 4];
- const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
- const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
- const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1;
+
+ // stage 0
+ // stage 1
+ bf1 = in[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
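+ // (1 << out_shift) >> 1 is the round-to-nearest offset for the
+ // arithmetic shift below, i.e. bf1 = (bf1 + 2^(out_shift - 1)) >>
+ // out_shift.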
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ bf1 = _mm_add_epi32(bf1, offset);
+ bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+ bf1 = _mm_max_epi32(bf1, clamp_lo_out);
+ bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+ }
+ // Broadcast the DC result to all 32 outputs of this pass.
+ for (int i = 0; i < 32; ++i) out[i] = bf1;
+}
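+
+// Editorial note: a DC-only (eob == 1) inverse 32-point pass reduces to
+// scaling the lone coefficient by cospi[32] and broadcasting it, which
+// is why the function above skips every stage except the stage-5 scale
+// and the final round/clamp.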
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32], bf0[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
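+
+/* Editorial sketch, not part of the upstream change: all the
+ * av1_highbd_inv_txfm_add_* wrappers here share one dispatch shape,
+ * which could be written with a hypothetical helper such as:
+ *
+ *   static int has_sse4_kernel(TX_TYPE t) {  // illustrative name only
+ *     switch (t) {
+ *       case V_DCT: case H_DCT: case V_ADST: case H_ADST:
+ *       case V_FLIPADST: case H_FLIPADST: case IDTX: return 0;
+ *       default: return 1;
+ *     }
+ *   }
+ *
+ * Unsupported types fall back to the C reference transforms; everything
+ * else takes the SSE4.1 path.
+ */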
+
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
switch (tx_type) {
case DCT_DCT:
- load_buffer_64x64_lower_32x32(coeff, in);
- transpose_64x64(in, out, 0);
- idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
- -shift[0]);
- transpose_64x64(in, out, 1);
- idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
- write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // Assembly version doesn't support IDTX, so use C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
+ default: assert(0);
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
default:
- av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
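+// 1-D inverse transform kernels indexed as [tx size][1-D tx type][eob class].
+// The low1/low8/low16 variants are reduced kernels for blocks whose eob
+// leaves only the first 1/8/16 inputs nonzero; NULL entries are combinations
+// that are never dispatched here (see the asserts below).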
+static const transform_1d_sse4_1
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;
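+ // Note: despite the name, buf_size_w_div8 counts 4-sample column groups
+ // (one __m128i of 32-bit coefficients each).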
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
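+ // 2:1 and 1:2 blocks (rect_type == +/-1) carry an extra 1/sqrt(2) scale,
+ // applied here before the row transform.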
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_sse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
break;
+ default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
index 608bd88a4..e298cf653 100644
--- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -14,7 +14,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index b29bd1d79..6f24e5948 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
-#define _HIGHBD_TXFM_UTILITY_SSE4_H
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
#include <smmintrin.h> /* SSE4.1 */
@@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
out[63]);
}
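+// Transposes a 32x32 block of 32-bit coefficients held in 256 __m128i
+// registers (eight per row), one TRANSPOSE_4X4 tile at a time.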
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+ for (int j = 0; j < 8; j++) {
+ for (int i = 0; i < 8; i++) {
+ TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+ input[i * 32 + j + 16], input[i * 32 + j + 24],
+ output[j * 32 + i + 0], output[j * 32 + i + 8],
+ output[j * 32 + i + 16], output[j * 32 + i + 24]);
+ }
+ }
+}
+
// Note:
// rounding = 1 << (bit - 1)
static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
@@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
return x;
}
-#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
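+// Signature shared by the 1-D inverse transform stages: `bit` is the stage
+// cos-bit, `do_cols` is 0 for the row pass and 1 for the column pass, and
+// `out_shift` is the rounding shift applied to the stage output.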
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
index a08beaafd..4bcab0564 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = {
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
};
-static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
- int sx, int alpha, int k,
- const int offset_bits_horiz,
- const int reduce_bits_horiz) {
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadu_si128(
(__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
@@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
// coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
// coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
// coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
- ((1 << reduce_bits_horiz) >> 1));
-
- // Calculate filtered results
- const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
- const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
- const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
- __m128i res_even =
- _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
+ coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
// Filter odd-index pixels
const __m128i tmp_1 = _mm_loadu_si128(
@@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+ coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+ int sx, __m128i *coeff) {
+ // Filter coeff
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+ coeff[4] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+ coeff[6] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+ coeff[1] = coeff[0];
+ coeff[3] = coeff[2];
+ coeff[5] = coeff[4];
+ coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+ const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+ const __m128i src_1 = *src;
+ const __m128i src2_1 = *src2;
- const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
- const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
- const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
- const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
__m128i res_odd =
_mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
@@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+ __m128i *tmp, int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
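+// alpha is the per-pixel and beta the per-row step of the horizontal filter
+// index, so when either is zero the corresponding coefficient fetch can be
+// hoisted out of a loop; dispatch to the matching specialization.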
+static INLINE void highbd_prepare_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ highbd_warp_horizontal_filter_alpha0_beta0(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha == 0 && beta != 0)
+ highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha != 0 && beta == 0)
+ highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else
+ highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
int width, int height, int stride,
uint16_t *pred, int p_col, int p_row,
@@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
- horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
- offset_bits_horiz, reduce_bits_horiz);
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
}
} else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src2 =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
- horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
- reduce_bits_horiz);
- }
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
}
// Vertical filter
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
index d1ea26290..9f2e2b457 100644
--- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -13,7 +13,6 @@
#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
@@ -21,6 +20,21 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ return wt;
+}
+
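+// Loads two unaligned 128-bit rows into one 256-bit register: `a` in the low
+// lane, `b` in the high lane (permute selector 0x20).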
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+ return _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
+}
+
void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
const InterpFilterParams *filter_params_x,
@@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_1;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -68,13 +78,11 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)subpel_y_q4;
for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
for (j = 0; j < w; j += 8) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
- 0x20);
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
__m256i res = convolve_lowbd_x(data, coeffs, filt);
@@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
// Accumulate values into the destination buffer
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i round_const =
_mm256_set1_epi32((1 << conv_params->round_1) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
for (j = 0; j < w; j += 16) {
const uint8_t *data = &src_ptr[j];
__m256i src6;
-
// Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
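+ // src_a[k] holds row k of this column strip; src_ab[k] packs rows k and
+ // k + 1 into the low/high 128-bit lanes for the unpacks below.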
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
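+ // Only rows i + 7 and i + 8 are new inside the loop; src6 carries the
+ // most recently loaded row across iterations.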
for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
@@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
if (w - j < 16) {
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -325,19 +282,12 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm256_add_epi16(res_hi_round, offset_const_2);
if (do_average) {
- const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
- const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
- 0x20);
+ const __m256i data_ref_0_lo = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
const __m256i comp_avg_res_lo =
comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
@@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
const int offset_0 =
@@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
for (j = 0; j < w; j += 8) {
/* Horizontal filter */
{
+ const uint8_t *src_h = src_ptr + j;
for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
if (i + 1 < im_h)
data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
__m256i res = convolve_lowbd_x(data, coeffs_x, filt);
res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
@@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
@@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m256i wt0 = _mm256_set1_epi16(w0);
- const __m256i wt1 = _mm256_set1_epi16(w1);
- const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
const __m256i zero = _mm256_setzero_si256();
const int offset_0 =
@@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
if (do_average) {
- const __m256i data_ref_0 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
- 0x20);
-
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
index ffbb31849..f645e0454 100644
--- a/third_party/aom/av1/common/x86/reconinter_avx2.c
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -16,8 +16,504 @@
#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
#include "av1/common/blockd.h"
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+ const __m256i s1) {
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+ // clamp(diff, 0, 64) can be skipped because diff is always in the range
+ // (38, 54).
+ return _mm256_abs_epi16(
+ _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+}
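+// E.g. with mask_base = 38 (DIFFWTD_38) and |s0 - s1| = 32, calc_mask_avx2
+// above yields 38 + (32 >> 4) = 40 per pixel.
+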
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + stride0);
+ const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + stride1);
+ const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+ xx_storeu_128(mask, x_m8);
+ src0 += (stride0 << 2);
+ src1 += (stride1 << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + stride0);
+ const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + stride1);
+ const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+ const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+ const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+ const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
+ const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 2;
+ src1 += stride1 << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {
+ do {
+ const __m128i s0A = xx_load_128(src0);
+ const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 1;
+ src1 += stride1 << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff, int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ return diff_clamp;
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff,
+ int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
+ return diff_const_16;
+}
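+// calc_mask_d16_inv_avx2 returns AOM_BLEND_A64_MAX_ALPHA minus the mask that
+// calc_mask_d16_avx2 would produce, i.e. the DIFFWTD_38_INV weighting.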
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
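+ // The shift removes the extra precision in the compound convolution buffer:
+ // 2 * FILTER_BITS added by the two filter passes, minus the round_0/round_1
+ // roundings already applied, plus the (bd - 8) high-bitdepth headroom.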
+ // Adding the rounding constant can overflow, but that much precision is not
+ // required. The code should also work for other values of DIFF_FACTOR_LOG2
+ // and AOM_BLEND_A64_MAX_ALPHA, but corner cases there are unverified, hence
+ // the asserts below.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+ if (mask_type == DIFFWTD_38) {
+ build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ } else {
+ build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ }
+}
+
void av1_build_compound_diffwtd_mask_highbd_avx2(
uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
index 375def62e..0aaf1f454 100644
--- a/third_party/aom/av1/common/x86/selfguided_avx2.c
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
-void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
- int dgd_stride, int32_t *flt0,
- int32_t *flt1, int flt_stride,
- int sgr_params_idx, int bit_depth,
- int highbd) {
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
// The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
// Ctl and Dtl is 32-byte aligned.
const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
- DECLARE_ALIGNED(32, int32_t,
- buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+ if (!buf) return -1;
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
}
+ aom_free(buf);
+ return 0;
}
void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
@@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
- width, eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
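Both SIMD variants trade a large on-stack scratch array (4 x RESTORATION_PROC_UNIT_PELS int32_t values) for an aligned heap allocation, and the function now reports allocation failure through its return value, which the apply_* callers check with an assert. A minimal sketch of that allocate/compute/free shape; posix_memalign (POSIX) stands in for aom_memalign here, which is an assumption made for the sketch's portability:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch only: mirror of the new error-returning scratch-buffer pattern. */
    static int filter_with_scratch(size_t elts) {
      void *mem = NULL;
      if (posix_memalign(&mem, 32, elts * sizeof(int32_t)) != 0)
        return -1;             /* propagate failure, as the AVX2 code does */
      int32_t *buf = (int32_t *)mem;
      buf[0] = 0;              /* ... the real code runs the SGR passes here ... */
      free(mem);
      return 0;
    }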
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index c64150b9d..ea3f6d942 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
-void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
- int height, int dgd_stride,
- int32_t *flt0, int32_t *flt1,
- int flt_stride, int sgr_params_idx,
- int bit_depth, int highbd) {
- DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
- memset(buf, 0, sizeof(buf));
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+ int height, int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ int32_t *buf = (int32_t *)aom_memalign(
+ 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+ if (!buf) return -1;
+ memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
}
+ aom_free(buf);
+ return 0;
}
void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
@@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
- av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
- width, eps, bit_depth, highbd);
+ const int ret = av1_selfguided_restoration_sse4_1(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
decode_xq(xqd, xq, params);
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index efc542cbf..b810cea2e 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse4.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
9, 11, 11, 13, 13, 15, 15, 0 };
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
- int alpha, int k,
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
const int offset_bits_horiz,
- const int reduce_bits_horiz) {
+ const int reduce_bits_horiz, int k) {
const __m128i src_even =
_mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
const __m128i src_odd =
_mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+ _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+ const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
// Filter even-index pixels
const __m128i tmp_0 = _mm_loadl_epi64(
(__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
@@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
// Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
// Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
// Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
// Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+ coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
- // The pixel order we need for 'src' is:
- // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
- const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
- const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
- // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
- const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
- _mm_srli_si128(src_odd, 4));
- const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
- // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
- const __m128i src_13 =
- _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
- const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
- // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
- const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
- _mm_srli_si128(src_even, 6));
- const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
- const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
- ((1 << reduce_bits_horiz) >> 1));
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+}
- // Note: The values res_02 + res_46 and res_13 + res_57 both
- // fit into int16s at this point, but their sum may be too wide to fit
- // into an int16. However, once we also add round_const, the sum of
- // all of these fits into a uint16.
- //
- // The wrapping behaviour of _mm_add_* is used here to make sure we
- // get the correct result despite converting between different
- // (implicit) types.
- const __m128i res_even = _mm_add_epi16(res_02, res_46);
- const __m128i res_odd = _mm_add_epi16(res_13, res_57);
- const __m128i res =
- _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
- tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+ int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+ int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta,
+ int p_height, int height, int i,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+ *res_sub_const =
+ _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ *wt = _mm_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // even coeffs
+ coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ // odd coeffs
+ coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ // even coeffs
+ coeffs[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+ coeffs[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+ coeffs[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+ coeffs[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+
+ // odd coeffs
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+ __m128i *res_lo, __m128i *res_hi,
+ int k) {
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+ const __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+ const __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
+static INLINE void store_vertical_filter_output(
+ __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+ const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+ uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+ const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m128i res_lo_1 = *res_lo;
+ __m128i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
+ }
+ } else {
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+}
+
+static INLINE void warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ (void)gamma;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ (void)gamma;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void prepare_warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+ sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else
+ warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+}
+
+static INLINE void prepare_warp_horizontal_filter(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else
+ warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
}
void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
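The four-way dispatch in prepare_warp_horizontal_filter (and its vertical twin) exists because alpha/gamma control per-pixel filter variation while beta/delta control per-row (per-column) variation: when a parameter is zero, the corresponding coefficient computation becomes loop-invariant, so the beta0/delta0 variants hoist it out of the loop and the alpha0/gamma0 variants collapse eight table loads into one load plus broadcast shuffles. A scalar sketch of the hoisting half of that idea, with hypothetical names:

    typedef struct { int c[8]; } Coeffs;

    /* Stand-in for prepare_horizontal_filter_coeff(): derive taps from sx. */
    static Coeffs get_coeffs(int sx) {
      Coeffs k;
      for (int i = 0; i < 8; ++i) k.c[i] = sx + i; /* placeholder taps */
      return k;
    }

    static int filter_row(const Coeffs *k) {
      int acc = 0;
      for (int i = 0; i < 8; ++i) acc += k->c[i];
      return acc;
    }

    /* beta == 0 means sx does not change across rows, so coefficient setup is
     * hoisted out of the loop, the same shape as warp_horizontal_filter_beta0
     * above. */
    static long warp_rows(int beta, int sx4, int rows) {
      long sum = 0;
      if (beta == 0) {
        const Coeffs k = get_coeffs(sx4);
        for (int r = 0; r < rows; ++r) sum += filter_row(&k);
      } else {
        for (int r = 0; r < rows; ++r) {
          const Coeffs k = get_coeffs(sx4 + beta * r);
          sum += filter_row(&k);
        }
      }
      return sum;
    }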
@@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
- const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
const __m128i reduce_bits_vert_const =
_mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
const int round_bits =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const __m128i res_sub_const =
- _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
- (1 << (offset_bits - conv_params->round_1 - 1)));
- __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
- __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
-
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const __m128i wt0 = _mm_set1_epi16(w0);
- const __m128i wt1 = _mm_set1_epi16(w1);
- const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
/* Note: For this code to work, the left/right frame borders need to be
@@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
}
}*/
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
for (i = 0; i < p_height; i += 8) {
for (j = 0; j < p_width; j += 8) {
@@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
reduce_bits_horiz);
}
} else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
- reduce_bits_horiz);
- }
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
}
// Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- if (conv_params->is_compound) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- res_lo = _mm_add_epi32(res_lo, res_add_const);
- res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
- reduce_bits_vert_shift);
- const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
- __m128i res_lo_16;
- if (conv_params->do_average) {
- __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
- const __m128i p_16 = _mm_loadl_epi64(p);
-
- if (conv_params->use_jnt_comp_avg) {
- const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
- const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
- const __m128i shifted_32 =
- _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
- res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
- } else {
- res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
- }
-
- res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
-
- res_lo_16 = _mm_sra_epi16(
- _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
- __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
- *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
- } else {
- _mm_storel_epi64(p, temp_lo_16);
- }
- if (p_width > 4) {
- __m128i *const p4 =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
- res_hi = _mm_add_epi32(res_hi, res_add_const);
- res_hi =
- _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
- reduce_bits_vert_shift);
- const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
- __m128i res_hi_16;
-
- if (conv_params->do_average) {
- __m128i *const dst8_4 =
- (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
- const __m128i p4_16 = _mm_loadl_epi64(p4);
-
- if (conv_params->use_jnt_comp_avg) {
- const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
- const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
- const __m128i shifted_32 =
- _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
- res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
- } else {
- res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
- }
- res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
-
- res_hi_16 = _mm_sra_epi16(
- _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
- __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
- *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
-
- } else {
- _mm_storel_epi64(p4, temp_hi_16);
- }
- }
- } else {
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
- ((1 << reduce_bits_vert) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
-
- const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
- } else {
- _mm_storel_epi64(p, res_8bit);
- }
- }
- }
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
}
}
}
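res_add_const_1 merges constants the old loop body added separately: in the compound path, the offset and the rounding term for reduce_bits_vert; in the non-compound path, removal of the bias introduced by the offset_bits bookkeeping plus the same rounding term. The fold rests on associativity of the additions performed before the arithmetic shift. A scalar check of that identity; the bias value is hypothetical, as the codec's actual bias follows offset_bits_vert:

    #include <assert.h>

    /* 'bias' models the offset added during filtering so intermediates stay
     * nonnegative; folding its removal and the rounding constant gives a
     * single add before the shift, as res_add_const_1 does above. */
    static int final_round(int acc_with_bias, int bias, int n) {
      const int fold = -bias + ((1 << n) >> 1); /* one constant, one add */
      return (acc_with_bias + fold) >> n;
    }

    int main(void) {
      const int n = 7, bias = 1 << 14;
      for (int x = -5000; x <= 5000; ++x)
        assert(final_round(x + bias, bias, n) ==
               ((x + ((1 << n) >> 1)) >> n));
      return 0;
    }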
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
index e1449fd21..87a6e1239 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
DECLARE_ALIGNED(32, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
index 3083d224b..f9d00b733 100644
--- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
DECLARE_ALIGNED(16, uint16_t,
temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
int i, j;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
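Both Wiener changes shave one row off the populated intermediate buffer: with 7 effective taps, h output rows consume intermediate rows 0 through h+5, i.e. h + SUBPEL_TAPS - 2 of them, and the extra row that the vectorized vertical pass may still load is zeroed rather than computed. That is a reading of the change, not text from the patch; a sketch of the bookkeeping:

    #include <string.h>
    #include <stdint.h>

    enum { kSubpelTaps = 8, kMaxSbSize = 128 }; /* SUBPEL_TAPS, MAX_SB_SIZE */

    static void prep_intermediate(uint16_t *temp /* kMaxSbSize row pitch */,
                                  int h) {
      const int intermediate_height = h + kSubpelTaps - 2; /* was ... - 1 */
      /* Clear the row past the last one the horizontal pass now writes; the
       * vertical pass's block loads may still touch it. The patch clears
       * MAX_SB_SIZE bytes (half a row of uint16_t), presumably enough to
       * cover the widest vector over-read. */
      memset(temp + intermediate_height * kMaxSbSize, 0, kMaxSbSize);
    }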
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
index 9099d081b..288e5e63e 100644
--- a/third_party/aom/av1/decoder/accounting.h
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_ACCOUNTING_H_
-#define AOM_ACCOUNTING_H_
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
#include <stdlib.h>
#include "aom/aomdx.h"
@@ -79,4 +79,4 @@ void aom_accounting_dump(Accounting *accounting);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_ACCOUNTING_H_
+#endif // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
index 6dbc4f3eb..31f14b531 100644
--- a/third_party/aom/av1/decoder/decodeframe.c
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -43,6 +43,7 @@
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
#include "av1/common/idct.h"
#include "av1/common/mvref_common.h"
#include "av1/common/pred_common.h"
@@ -87,18 +88,25 @@ int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
const YV12_BUFFER_CONFIG *const buf,
int only_chroma) {
- const int val = 1 << (seq_params->bit_depth - 1);
-
- for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
- const int is_uv = plane > 0;
- for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
- if (seq_params->use_highbitdepth) {
- // TODO(yaowu): replace this with aom_memset16() for speed
- for (int col_idx = 0; col_idx < buf->crop_widths[is_uv]; col_idx++) {
- uint16_t *base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
- base[row_idx * buf->strides[is_uv] + col_idx] = val;
+ if (seq_params->use_highbitdepth) {
+ const int val = 1 << (seq_params->bit_depth - 1);
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+ // Set the first row to neutral grey. Then copy the first row to all
+ // subsequent rows.
+ if (buf->crop_heights[is_uv] > 0) {
+ aom_memset16(base, val, buf->crop_widths[is_uv]);
+ for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+ memcpy(&base[row_idx * buf->strides[is_uv]], base,
+ sizeof(*base) * buf->crop_widths[is_uv]);
}
- } else {
+ }
+ }
+ } else {
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
buf->crop_widths[is_uv]);
}
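The high-bit-depth path of set_planes_to_neutral_grey now seeds one row with aom_memset16 and replicates it with memcpy, replacing the per-pixel TODO loop with wide library copies. The same fill-then-replicate shape in isolation, with fill16 standing in for aom_memset16:

    #include <string.h>
    #include <stdint.h>

    static void fill16(uint16_t *dst, uint16_t val, int n) {
      for (int i = 0; i < n; ++i) dst[i] = val;
    }

    static void fill_plane_u16(uint16_t *base, int stride, int w, int h,
                               uint16_t val) {
      if (h <= 0) return;
      fill16(base, val, w);          /* seed the first row */
      for (int r = 1; r < h; ++r)    /* then replicate it downward */
        memcpy(base + r * stride, base, sizeof(*base) * (size_t)w);
    }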
@@ -687,11 +695,10 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
for (int x = 0; x < b8_w; x += b4_w) {
MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
is_compound = has_second_ref(this_mbmi);
- DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]);
int tmp_dst_stride = 8;
assert(bw < 8 || bh < 8);
ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd);
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
conv_params.use_jnt_comp_avg = 0;
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
@@ -735,7 +742,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
&pre, &src_stride);
- conv_params.ref = ref;
conv_params.do_average = ref;
if (is_masked_compound_type(mi->interinter_comp.type)) {
// masked compound type has its own average mechanism
@@ -762,7 +768,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
uint8_t *const dst = dst_buf->buf;
uint8_t *pre[2];
SubpelParams subpel_params[2];
- DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
int src_stride[2];
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
@@ -797,7 +802,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
}
ConvolveParams conv_params = get_conv_params_no_round(
- 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
&conv_params.bck_offset,
&conv_params.use_jnt_comp_avg, is_compound);
@@ -808,7 +813,6 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global[ref];
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
- conv_params.ref = ref;
conv_params.do_average = ref;
if (is_masked_compound_type(mi->interinter_comp.type)) {
// masked compound type has its own average mechanism
@@ -931,7 +935,7 @@ static void dec_build_prediction_by_above_preds(
// Adjust mb_to_bottom_edge to have the correct value for the OBMC
// prediction block. This is half the height of the original block,
// except for 128-wide blocks, where we only use a height of 32.
- int this_height = xd->n8_h * MI_SIZE;
+ int this_height = xd->n4_h * MI_SIZE;
int pred_height = AOMMIN(this_height / 2, 32);
xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
@@ -984,7 +988,7 @@ static void dec_build_prediction_by_left_preds(
// Adjust mb_to_right_edge to have the correct value for the OBMC
// prediction block. This is half the width of the original block,
// except for 128-wide blocks, where we only use a width of 32.
- int this_width = xd->n8_w * MI_SIZE;
+ int this_width = xd->n4_w * MI_SIZE;
int pred_width = AOMMIN(this_width / 2, 32);
xd->mb_to_right_edge += (this_width - pred_width) * 8;
@@ -1006,8 +1010,6 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
MACROBLOCKD *xd, int mi_row,
int mi_col) {
const int num_planes = av1_num_planes(cm);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -1018,19 +1020,23 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int len = sizeof(uint16_t);
- dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
- dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
- dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
- dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
- dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
- dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
} else {
- dst_buf1[0] = tmp_buf1;
- dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
- dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
- dst_buf2[0] = tmp_buf2;
- dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
- dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
}
dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
dst_width1, dst_height1, dst_stride1);
@@ -1069,8 +1075,9 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
}
dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
- if (mbmi->motion_mode == OBMC_CAUSAL)
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
#if CONFIG_MISMATCH_DEBUG
for (int plane = 0; plane < num_planes; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -1225,9 +1232,18 @@ static void decode_token_recon_block(AV1Decoder *const pbi,
set_color_index_map_offset);
}
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi);
+#endif
+
static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
- TX_SIZE tx_size, int depth, int blk_row,
- int blk_col, aom_reader *r) {
+ TX_SIZE tx_size, int depth,
+#if LOOP_FILTER_BITMASK
+ AV1_COMMON *cm, int mi_row, int mi_col,
+#endif
+ int blk_row, int blk_col, aom_reader *r) {
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
int is_split = 0;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -1271,15 +1287,29 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
mbmi->tx_size = sub_txs;
txfm_partition_update(xd->above_txfm_context + blk_col,
xd->left_txfm_context + blk_row, sub_txs, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8,
+ TX_4X4, mbmi);
+#endif
return;
}
+#if LOOP_FILTER_BITMASK
+ if (depth + 1 == MAX_VARTX_DEPTH) {
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], sub_txs, mbmi);
+ }
+#endif
assert(bsw > 0 && bsh > 0);
for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
int offsetr = blk_row + row;
int offsetc = blk_col + col;
- read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
+ read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ offsetr, offsetc, r);
}
}
} else {
@@ -1293,6 +1323,10 @@ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
mbmi->tx_size = tx_size;
txfm_partition_update(xd->above_txfm_context + blk_col,
xd->left_txfm_context + blk_row, tx_size, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], tx_size, mbmi);
+#endif
}
}
@@ -1330,6 +1364,191 @@ static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
}
}
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+ mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (tx_size - TX_4X16);
+ }
+ int index = 0;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ // Use a lookup table that provides one bitmask for a given block size and
+ // a univariant transform size.
+ int index;
+ int shift;
+ int row;
+ int col;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (mbmi->tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
+ mask_id =
+ 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (mbmi->tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (mbmi->tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (mbmi->tx_size - TX_4X16);
+ }
+ row = mi_row % MI_SIZE_64X64;
+ col = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ int index;
+ int shift;
+ int row;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int row_start = mi_row % MI_SIZE_64X64;
+ const int col_start = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col_start, row_start, &index);
+ const uint64_t top_edge_mask =
+ ((uint64_t)1 << (shift + mi_size_wide[bsize])) - ((uint64_t)1 << shift);
+ lfm->is_horz_border.bits[index] |= top_edge_mask;
+ const int is_vert_border = mask_id_table_vert_border[bsize];
+ const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->is_vert_border.bits[i + index] |=
+ (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift);
+ }
+ const int is_skip = mbmi->skip && is_inter_block(mbmi);
+ if (is_skip) {
+ const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->skip.bits[i + index] |=
+ (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+ }
+ }
+ const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+ const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+ const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+ const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+ for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+ index = 0;
+ row = r % MI_SIZE_64X64;
+ memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_u[row][col_start], level_u,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_v[row][col_start], level_v,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ }
+}
+#endif
+
static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
int mi_row, int mi_col, aom_reader *r,
PARTITION_TYPE partition, BLOCK_SIZE bsize) {
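In store_bitmask_other_info above, top_edge_mask is a run of mi_size_wide[bsize] consecutive set bits starting at bit position shift, produced by the difference of two powers of two. A standalone check of that identity, valid while shift + w < 64:

    #include <assert.h>
    #include <stdint.h>

    /* Run of w consecutive set bits starting at bit 'shift'. */
    static uint64_t bit_run(int shift, int w) {
      return ((uint64_t)1 << (shift + w)) - ((uint64_t)1 << shift);
    }

    int main(void) {
      assert(bit_run(0, 4) == 0xFull);
      assert(bit_run(8, 4) == 0xF00ull);
      assert(bit_run(16, 16) == 0xFFFF0000ull);
      return 0;
    }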
@@ -1353,14 +1572,46 @@ static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
for (int idy = 0; idy < height; idy += bh)
for (int idx = 0; idx < width; idx += bw)
- read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
+ read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ idy, idx, r);
} else {
mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
if (inter_block_tx)
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
mbmi->skip && is_inter_block(mbmi), xd);
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+ BLOCK_64X64, mbmi);
+ }
+ }
+ }
+#endif
+ }
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
+ mbmi);
+ }
+ }
}
+#endif
if (cm->delta_q_present_flag) {
for (int i = 0; i < MAX_SEGMENTS; i++) {
@@ -1952,6 +2203,11 @@ static void setup_quantization(AV1_COMMON *const cm,
cm->v_dc_delta_q = cm->u_dc_delta_q;
cm->v_ac_delta_q = cm->u_ac_delta_q;
}
+ } else {
+ cm->u_dc_delta_q = 0;
+ cm->u_ac_delta_q = 0;
+ cm->v_dc_delta_q = 0;
+ cm->v_ac_delta_q = 0;
}
cm->dequant_bit_depth = seq_params->bit_depth;
cm->using_qmatrix = aom_rb_read_bit(rb);
@@ -2082,29 +2338,9 @@ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
cm->cur_frame->height = cm->height;
}
-static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
- struct aom_read_bit_buffer *rb) {
- const SequenceHeader *const seq_params = &cm->seq_params;
- int width, height;
+static void setup_buffer_pool(AV1_COMMON *cm) {
BufferPool *const pool = cm->buffer_pool;
-
- if (frame_size_override_flag) {
- int num_bits_width = seq_params->num_bits_width;
- int num_bits_height = seq_params->num_bits_height;
- av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
- if (width > seq_params->max_frame_width ||
- height > seq_params->max_frame_height) {
- aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
- "Frame dimensions are larger than the maximum values");
- }
- } else {
- width = seq_params->max_frame_width;
- height = seq_params->max_frame_height;
- }
-
- setup_superres(cm, rb, &width, &height);
- resize_context_buffers(cm, width, height);
- setup_render_size(cm, rb);
+ const SequenceHeader *const seq_params = &cm->seq_params;
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
@@ -2140,6 +2376,31 @@ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
}
+static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+ struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ int width, height;
+
+ if (frame_size_override_flag) {
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+ if (width > seq_params->max_frame_width ||
+ height > seq_params->max_frame_height) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame dimensions are larger than the maximum values");
+ }
+ } else {
+ width = seq_params->max_frame_width;
+ height = seq_params->max_frame_height;
+ }
+
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ setup_render_size(cm, rb);
+ setup_buffer_pool(cm);
+}
+
static void setup_sb_size(SequenceHeader *seq_params,
struct aom_read_bit_buffer *rb) {
set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
@@ -2158,7 +2419,6 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
int width, height;
int found = 0;
int has_valid_ref_frame = 0;
- BufferPool *const pool = cm->buffer_pool;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
if (aom_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
@@ -2208,39 +2468,7 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has incompatible color format");
}
-
- lock_buffer_pool(pool);
- if (aom_realloc_frame_buffer(
- get_frame_new_buffer(cm), cm->width, cm->height,
- seq_params->subsampling_x, seq_params->subsampling_y,
- seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment,
- &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
- pool->cb_priv)) {
- unlock_buffer_pool(pool);
- aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
- "Failed to allocate frame buffer");
- }
- unlock_buffer_pool(pool);
-
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
- seq_params->subsampling_x;
- pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
- seq_params->subsampling_y;
- pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
- (unsigned int)seq_params->bit_depth;
- pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
- seq_params->color_primaries;
- pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
- seq_params->transfer_characteristics;
- pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
- seq_params->matrix_coefficients;
- pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
- pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
- seq_params->chroma_sample_position;
- pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
- pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
- pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+ setup_buffer_pool(cm);
}
// Same function as av1_read_uniform but reading from the uncompressed header's bit buffer
@@ -2252,7 +2480,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
if (v < m)
return v;
else
- return (v << 1) - m + aom_rb_read_literal(rb, 1);
+ return (v << 1) - m + aom_rb_read_bit(rb);
}
static void read_tile_info_max_tile(AV1_COMMON *const cm,
@@ -2344,6 +2572,10 @@ static void read_tile_info(AV1Decoder *const pbi,
// tile to use for cdf update
cm->context_update_tile_id =
aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid context_update_tile_id");
+ }
// tile size magnitude
pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
}
@@ -2746,31 +2978,13 @@ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
#endif // CONFIG_MULTITHREAD
}
-static INLINE int get_sb_rows_in_tile(AV1Decoder *pbi, TileInfo tile) {
- AV1_COMMON *cm = &pbi->common;
- int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
- tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
- int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
- return sb_rows;
-}
-
-static INLINE int get_sb_cols_in_tile(AV1Decoder *pbi, TileInfo tile) {
- AV1_COMMON *cm = &pbi->common;
- int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
- tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
- int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
-
- return sb_cols;
-}
-
static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
TileInfo tile_info, const int mi_row) {
AV1_COMMON *const cm = &pbi->common;
const int num_planes = av1_num_planes(cm);
TileDataDec *const tile_data =
pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
- const int sb_cols_in_tile = get_sb_cols_in_tile(pbi, tile_info);
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
const int sb_row_in_tile =
(mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
int sb_col_in_tile = 0;
@@ -2792,15 +3006,11 @@ static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
}
static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+ if (aom_reader_has_overflowed(r)) return -1;
+
uint32_t nb_bits = aom_reader_tell(r);
uint32_t nb_bytes = (nb_bits + 7) >> 3;
-
- const uint8_t *p_begin = aom_reader_find_begin(r);
- const uint8_t *p_end = aom_reader_find_end(r);
-
- // It is legal to have no padding bytes (nb_bytes == p_end - p_begin).
- if ((ptrdiff_t)nb_bytes > p_end - p_begin) return -1;
- const uint8_t *p = p_begin + nb_bytes;
+ const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
// aom_reader_tell() returns 1 for a newly initialized decoder, and the
// return value only increases as values are decoded. So nb_bits > 0, and
@@ -2810,6 +3020,7 @@ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
// Make sure that all padding bytes are zero as required by the spec.
+ const uint8_t *p_end = aom_reader_find_end(r);
while (p < p_end) {
if (*p != 0) return -1;
p++;
@@ -2863,6 +3074,11 @@ static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
// Bit-stream parsing and decoding of the superblock
decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
cm->seq_params.sb_size, 0x3);
+
+ if (aom_reader_has_overflowed(td->bit_reader)) {
+ aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+ return;
+ }
}
}
@@ -2950,6 +3166,11 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
td->xd.corrupted = 0;
td->xd.mc_buf[0] = td->mc_buf[0];
td->xd.mc_buf[1] = td->mc_buf[1];
+ td->xd.tmp_conv_dst = td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+ }
+
for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
@@ -3236,6 +3457,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
#endif
frame_row_mt_info->row_mt_exit = 1;
#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
pthread_mutex_unlock(pbi->row_mt_mutex_);
#endif
return 0;
@@ -3386,16 +3608,24 @@ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
}
-void av1_free_mc_tmp_buf(ThreadData *thread_data, int use_highbd) {
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
int ref;
for (ref = 0; ref < 2; ref++) {
- if (use_highbd)
+ if (thread_data->mc_buf_use_highbd)
aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
else
aom_free(thread_data->mc_buf[ref]);
thread_data->mc_buf[ref] = NULL;
}
thread_data->mc_buf_size = 0;
+ thread_data->mc_buf_use_highbd = 0;
+
+ aom_free(thread_data->tmp_conv_dst);
+ thread_data->tmp_conv_dst = NULL;
+ for (int i = 0; i < 2; ++i) {
+ aom_free(thread_data->tmp_obmc_bufs[i]);
+ thread_data->tmp_obmc_bufs[i] = NULL;
+ }
}
static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
@@ -3411,6 +3641,17 @@ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
}
}
thread_data->mc_buf_size = buf_size;
+ thread_data->mc_buf_use_highbd = use_highbd;
+
+ CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->tmp_conv_dst)));
+ for (int i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->tmp_obmc_bufs[i])));
+ }
}
static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
@@ -3425,6 +3666,10 @@ static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
thread_data->td->xd.corrupted = 0;
thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
+ thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+ }
winterface->sync(worker);
worker->hook = worker_hook;
@@ -3511,7 +3756,7 @@ static void decode_mt_init(AV1Decoder *pbi) {
for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
if (thread_data->td->mc_buf_size != buf_size) {
- av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+ av1_free_mc_tmp_buf(thread_data->td);
allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
}
}
@@ -3783,8 +4028,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
av1_tile_init(&tile_data->tile_info, cm, row, col);
- max_sb_rows =
- AOMMAX(max_sb_rows, get_sb_rows_in_tile(pbi, tile_data->tile_info));
+ max_sb_rows = AOMMAX(max_sb_rows,
+ av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
}
}
@@ -3905,6 +4150,8 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
if (!seq_params->monochrome)
pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+ else
+ pars->chroma_scaling_from_luma = 0;
if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
@@ -4412,6 +4659,29 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
*cm->fc = cm->frame_contexts[existing_frame_idx];
}
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (i != cm->new_fb_idx) {
+ frame_bufs[i].ref_count = 0;
+ cm->buffer_pool->release_fb_cb(cm->buffer_pool->cb_priv,
+ &frame_bufs[i].raw_frame_buffer);
+ } else {
+ assert(frame_bufs[i].ref_count == 1);
+ }
+ frame_bufs[i].cur_frame_offset = 0;
+ av1_zero(frame_bufs[i].ref_frame_offset);
+ }
+ av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
+ unlock_buffer_pool(cm->buffer_pool);
+}
+
// On success, returns 0. On failure, calls aom_internal_error and does not
// return.
static int read_uncompressed_header(AV1Decoder *pbi,
@@ -4443,6 +4713,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->reset_decoder_state = 0;
if (cm->show_existing_frame) {
+ if (pbi->sequence_header_changed) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "New sequence header starts with a show_existing_frame.");
+ }
// Show an existing frame directly.
const int existing_frame_idx = aom_rb_read_literal(rb, 3);
const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
@@ -4493,6 +4768,18 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits
+ if (pbi->sequence_header_changed) {
+ if (pbi->common.frame_type == KEY_FRAME) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(&pbi->common);
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Sequence header has changed without a keyframe.");
+ }
+ }
+
cm->show_frame = aom_rb_read_bit(rb);
if (seq_params->still_picture &&
(cm->frame_type != KEY_FRAME || !cm->show_frame)) {
@@ -4582,8 +4869,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
}
- frame_size_override_flag =
- frame_is_sframe(cm) ? 1 : aom_rb_read_literal(rb, 1);
+ frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
cm->frame_offset =
aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
@@ -5152,7 +5438,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
if (pbi->td.mc_buf_size != buf_size) {
- av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+ av1_free_mc_tmp_buf(&pbi->td);
allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
}
}
@@ -5166,6 +5452,11 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
const int tile_count_tg = end_tile - start_tile + 1;
if (initialize_flag) setup_frame_info(pbi);
+ const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame_init(cm, 0, num_planes);
+ av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
pbi->row_mt)
@@ -5177,7 +5468,6 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
else
*p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
- const int num_planes = av1_num_planes(cm);
// If the bit stream is monochrome, set the U and V buffers to a constant.
if (num_planes < 3) {
set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
@@ -5190,7 +5480,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (!cm->allow_intrabc && !cm->single_tile_decoding) {
if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0,
num_planes, 0);
#else
if (pbi->num_workers > 1) {
@@ -5255,6 +5545,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
if (!xd->corrupted) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ assert(cm->context_update_tile_id < pbi->allocated_tiles);
*cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
av1_reset_cdf_symbol_counters(cm->fc);
}
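The reworked check_trailing_bits_after_symbol_coder() above verifies the spec's trailing-bit pattern: the last symbol byte must end in a single 1 bit followed by zeros, and any padding bytes after it must be all zero. A standalone sketch of that check, assuming aom_reader_tell()'s bit count includes the trailing one bit (only the comparison line is visible in the hunk above; the pattern computation here is an assumption):

#include <stddef.h>
#include <stdint.h>

// Returns 1 if the trailing-bit pattern after nb_bits of symbol data is
// valid, 0 otherwise. buf_size covers the symbol data plus padding.
static int trailing_bits_ok(const uint8_t *buf, size_t buf_size,
                            uint32_t nb_bits) {
  const size_t nb_bytes = (nb_bits + 7) >> 3;
  if (nb_bytes == 0 || nb_bytes > buf_size) return 0;
  // Bit nb_bits - 1 must be 1 and all lower bits of that byte must be 0,
  // i.e. the last used byte ends in 100...0.
  const uint8_t pattern = (uint8_t)(128 >> ((nb_bits - 1) & 7));
  if ((buf[nb_bytes - 1] & (2 * pattern - 1)) != pattern) return 0;
  for (size_t i = nb_bytes; i < buf_size; ++i)  // padding must be zero
    if (buf[i] != 0) return 0;
  return 1;
}

Note how the patch reorders the work: the reader overflow check now runs first, and aom_reader_find_end() is only consulted once the padding bytes are actually scanned.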
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
index d289b31f2..ddad273f1 100644
--- a/third_party/aom/av1/decoder/decodeframe.h
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODEFRAME_H_
-#define AV1_DECODER_DECODEFRAME_H_
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
#ifdef __cplusplus
extern "C" {
@@ -74,7 +74,7 @@ struct aom_read_bit_buffer *av1_init_read_bit_buffer(
struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
const uint8_t *data_end);
-void av1_free_mc_tmp_buf(struct ThreadData *thread_data, int use_highbd);
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
@@ -82,4 +82,4 @@ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
} // extern "C"
#endif
-#endif // AV1_DECODER_DECODEFRAME_H_
+#endif // AOM_AV1_DECODER_DECODEFRAME_H_
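Back in decodeframe.c, the rb_read_uniform() hunk above swapped aom_rb_read_literal(rb, 1) for aom_rb_read_bit(rb); the decoded value is unchanged. The function reads a value uniform in [0, n) using l - 1 bits plus an optional extra bit, where l = ceil(log2(n)) and m = (1 << l) - n. For n = 5: l = 3 and m = 3, so a 2-bit v in {0, 1, 2} is returned directly, while v = 3 consumes one more bit and yields (3 << 1) - 3 + bit, i.e. 3 or 4. A sketch over a generic bit source (the reader callback is a placeholder, mirroring av1_read_uniform as I understand it):

#include <assert.h>

// Bits needed to represent values in [0, n); e.g. n = 5 -> 3.
static int unsigned_bits(int n) {
  int bits = 1;
  while ((1 << bits) < n) ++bits;
  return bits;
}

// Decode a value uniformly distributed in [0, n) from a bit source.
static int read_uniform(int (*next_bit)(void *), void *ctx, int n) {
  const int l = unsigned_bits(n);
  const int m = (1 << l) - n;
  assert(l != 0);
  int v = 0;
  for (int i = 0; i < l - 1; ++i) v = (v << 1) | next_bit(ctx);
  if (v < m) return v;                  // short codeword: l - 1 bits
  return (v << 1) - m + next_bit(ctx);  // long codeword: one extra bit
}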
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
index 5e920b18d..551e4d543 100644
--- a/third_party/aom/av1/decoder/decodemv.c
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -94,42 +94,26 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
}
return reduced_delta_qindex;
}
-static int read_delta_lflevel(AV1_COMMON *cm, const MACROBLOCKD *xd,
- aom_reader *r, int lf_id,
- MB_MODE_INFO *const mbmi, int mi_col,
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+ aom_cdf_prob *const cdf,
+ const MB_MODE_INFO *const mbmi, int mi_col,
int mi_row) {
- int sign, abs, reduced_delta_lflevel = 0;
- BLOCK_SIZE bsize = mbmi->sb_type;
+ int reduced_delta_lflevel = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int b_col = mi_col & (cm->seq_params.mib_size - 1);
const int b_row = mi_row & (cm->seq_params.mib_size - 1);
const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
read_delta_lf_flag) {
- if (cm->delta_lf_multi) {
- assert(lf_id >= 0 &&
- lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
- : FRAME_LF_COUNT - 2));
- abs = aom_read_symbol(r, ec_ctx->delta_lf_multi_cdf[lf_id],
- DELTA_LF_PROBS + 1, ACCT_STR);
- } else {
- abs = aom_read_symbol(r, ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1,
- ACCT_STR);
- }
+ int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
const int smallval = (abs < DELTA_LF_SMALL);
if (!smallval) {
const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
const int thr = (1 << rem_bits) + 1;
abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
}
-
- if (abs) {
- sign = aom_read_bit(r, ACCT_STR);
- } else {
- sign = 1;
- }
-
+ const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
reduced_delta_lflevel = sign ? -abs : abs;
}
return reduced_delta_lflevel;
@@ -618,19 +602,22 @@ static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
int blk_col, TX_SIZE tx_size, aom_reader *r) {
MB_MODE_INFO *mbmi = xd->mi[0];
- const int inter_block = is_inter_block(mbmi);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
-
const int txk_type_idx =
av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+ *tx_type = DCT_DCT;
+
+ // No need to read transform type if block is skipped.
+ if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ return;
+
+ // No need to read transform type for lossless mode (qindex == 0).
+ const int qindex =
+ cm->seg.enabled ? xd->qindex[mbmi->segment_id] : cm->base_qindex;
+ if (qindex <= 0) return;
- const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
- if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1 &&
- ((!cm->seg.enabled && cm->base_qindex > 0) ||
- (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
- !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int inter_block = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) {
const TxSetType tx_set_type =
av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
const int eset =
@@ -639,23 +626,22 @@ void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
// there is no need to read the tx_type
assert(eset != 0);
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
if (inter_block) {
*tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
} else {
- PREDICTION_MODE intra_dir;
- if (mbmi->filter_intra_mode_info.use_filter_intra)
- intra_dir =
- fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
- else
- intra_dir = mbmi->mode;
+ const PREDICTION_MODE intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode]
+ : mbmi->mode;
*tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
- r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+ r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
}
- } else {
- *tx_type = DCT_DCT;
}
}
@@ -720,6 +706,43 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
}
}
+// If delta q is present, reads delta_q index.
+// Also reads delta_q loop filter levels, if present.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ const int mi_row, const int mi_col,
+ aom_reader *r) {
+ if (cm->delta_q_present_flag) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ xd->current_qindex +=
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int tmp_lvl =
+ xd->delta_lf[lf_id] +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
+ mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ } else {
+ const int tmp_lvl = xd->delta_lf_from_base +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
+ mbmi, mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+}
+
static void read_intra_frame_mode_info(AV1_COMMON *const cm,
MACROBLOCKD *const xd, int mi_row,
int mi_col, aom_reader *r) {
@@ -743,33 +766,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
read_cdef(cm, r, xd, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- xd->current_qindex +=
- read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
- /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
- xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- const int tmp_lvl =
- xd->delta_lf[lf_id] +
- read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- } else {
- const int tmp_lvl =
- xd->delta_lf_from_base +
- read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf_from_base = xd->delta_lf_from_base =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- }
- }
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
mbmi->current_qindex = xd->current_qindex;
@@ -1402,7 +1399,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
mbmi->motion_mode = SIMPLE_TRANSLATION;
if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
!has_second_ref(mbmi))
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
if (mbmi->ref_frame[1] != INTRA_FRAME)
@@ -1463,20 +1460,20 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
read_mb_interp_filter(cm, xd, mbmi, r);
if (mbmi->motion_mode == WARPED_CAUSAL) {
- mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
- mbmi->wm_params[0].invalid = 0;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->wm_params.invalid = 0;
- if (mbmi->num_proj_ref[0] > 1)
- mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
- mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
- if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
- &mbmi->wm_params[0], mi_row, mi_col)) {
+ &mbmi->wm_params, mi_row, mi_col)) {
#if WARPED_MOTION_DEBUG
printf("Warning: unexpected warped model from aomenc\n");
#endif
- mbmi->wm_params[0].invalid = 1;
+ mbmi->wm_params.invalid = 1;
}
}
@@ -1512,33 +1509,7 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
read_cdef(cm, r, xd, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- xd->current_qindex +=
- read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
- /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
- xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- const int tmp_lvl =
- xd->delta_lf[lf_id] +
- read_delta_lflevel(cm, xd, r, lf_id, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- } else {
- const int tmp_lvl =
- xd->delta_lf_from_base +
- read_delta_lflevel(cm, xd, r, -1, mbmi, mi_col, mi_row) *
- cm->delta_lf_res;
- mbmi->delta_lf_from_base = xd->delta_lf_from_base =
- clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
- }
- }
- }
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
if (!mbmi->skip_mode)
inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
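The read_delta_lflevel() refactor above keeps the escape coding intact: a small magnitude class comes from the arithmetic coder, and the top class escapes to a variable-width literal, followed by a sign bit for any nonzero value. A runnable sketch of everything after the symbol read (the raw-bit reader is a stand-in for aom_reader; DELTA_LF_SMALL == 3 is libaom's value, stated here as an assumption):

#include <stdint.h>

#define DELTA_LF_SMALL 3

// Tiny MSB-first raw-bit reader standing in for the aom_read_* calls.
typedef struct { const uint8_t *buf; uint32_t pos; } BitSrc;

static int next_bit(BitSrc *s) {
  const int b = (s->buf[s->pos >> 3] >> (7 - (s->pos & 7))) & 1;
  s->pos++;
  return b;
}

static int next_literal(BitSrc *s, int n) {
  int v = 0;
  while (n--) v = (v << 1) | next_bit(s);
  return v;
}

// Given the decoded magnitude class (0 .. DELTA_LF_SMALL, top class is the
// escape), finish decoding the signed delta loop-filter level.
static int finish_delta_lf(BitSrc *s, int abs_class) {
  int abs = abs_class;
  if (abs >= DELTA_LF_SMALL) {
    // Escape: 3 bits give the magnitude width, then that many literal bits.
    const int rem_bits = next_literal(s, 3) + 1;
    const int thr = (1 << rem_bits) + 1;
    abs = next_literal(s, rem_bits) + thr;
  }
  // A sign bit follows any nonzero magnitude; a set bit means negative.
  const int sign = abs ? next_bit(s) : 1;
  return sign ? -abs : abs;
}

The refactor's real payoff is that the caller now picks the CDF (delta_lf_multi_cdf[lf_id] or delta_lf_cdf), so the function no longer needs the lf_id parameter or the MACROBLOCKD.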
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
index 6243bb168..1625e5bd2 100644
--- a/third_party/aom/av1/decoder/decodemv.h
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODEMV_H_
-#define AV1_DECODER_DECODEMV_H_
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
#include "aom_dsp/bitreader.h"
@@ -32,4 +32,4 @@ void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
int blk_col, TX_SIZE tx_size, aom_reader *r);
-#endif // AV1_DECODER_DECODEMV_H_
+#endif // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
index e978fad6c..a5f4fd67f 100644
--- a/third_party/aom/av1/decoder/decoder.c
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -37,16 +37,11 @@
#include "av1/decoder/obu.h"
static void initialize_dec(void) {
- static volatile int init_done = 0;
-
- if (!init_done) {
- av1_rtcd();
- aom_dsp_rtcd();
- aom_scale_rtcd();
- av1_init_intra_predictors();
- av1_init_wedge_masks();
- init_done = 1;
- }
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_wedge_masks();
}
static void dec_setup_mi(AV1_COMMON *cm) {
@@ -171,8 +166,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
if (pbi->thread_data) {
for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
- const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
- av1_free_mc_tmp_buf(thread_data->td, use_highbd);
+ av1_free_mc_tmp_buf(thread_data->td);
aom_free(thread_data->td);
}
aom_free(pbi->thread_data);
@@ -209,8 +203,7 @@ void av1_decoder_remove(AV1Decoder *pbi) {
#if CONFIG_ACCOUNTING
aom_accounting_clear(&pbi->accounting);
#endif
- const int use_highbd = pbi->common.seq_params.use_highbitdepth ? 1 : 0;
- av1_free_mc_tmp_buf(&pbi->td, use_highbd);
+ av1_free_mc_tmp_buf(&pbi->td);
aom_free(pbi);
}
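A note on the initialize_dec() change above: a static volatile int is not a safe call-once guard, since volatile provides neither atomicity nor ordering between threads, so the patch drops the flag and lets the (presumably idempotent) setup calls run on every invocation. If genuine once-only semantics were wanted at this layer, the portable pattern would be pthread_once; a generic sketch, not what libaom does here:

#include <pthread.h>

static pthread_once_t dec_init_once = PTHREAD_ONCE_INIT;

static void do_initialize_dec(void) {
  // One-time RTCD/table setup would go here.
}

static void initialize_dec(void) {
  // Runs do_initialize_dec exactly once even if several threads race to
  // create decoders at the same time.
  pthread_once(&dec_init_once, do_initialize_dec);
}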
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
index 610b98d95..5ca939c24 100644
--- a/third_party/aom/av1/decoder/decoder.h
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DECODER_H_
-#define AV1_DECODER_DECODER_H_
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
#include "config/aom_config.h"
@@ -55,6 +55,11 @@ typedef struct ThreadData {
CB_BUFFER cb_buffer_base;
uint8_t *mc_buf[2];
int32_t mc_buf_size;
+ int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in
+ // mc_buf were converted from highbd pointers.
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
@@ -199,6 +204,7 @@ typedef struct AV1Decoder {
int tg_start; // First tile in the current tilegroup
int tg_size_bit_offset;
int sequence_header_ready;
+ int sequence_header_changed;
#if CONFIG_INSPECTION
aom_inspect_cb inspect_cb;
void *inspect_ctx;
@@ -308,4 +314,4 @@ typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
} // extern "C"
#endif
-#endif // AV1_DECODER_DECODER_H_
+#endif // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
index 687bba958..fe04f6abd 100644
--- a/third_party/aom/av1/decoder/decodetxb.h
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef DECODETXB_H_
-#define DECODETXB_H_
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
#include "config/aom_config.h"
@@ -29,4 +29,4 @@ void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
MACROBLOCKD *const xd, aom_reader *const r,
const int plane, const int row, const int col,
const TX_SIZE tx_size);
-#endif // DECODETXB_H_
+#endif // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
index ec85bf7ea..173b437a9 100644
--- a/third_party/aom/av1/decoder/detokenize.h
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DETOKENIZE_H_
-#define AV1_DECODER_DETOKENIZE_H_
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
#include "config/aom_config.h"
@@ -26,4 +26,4 @@ void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_DECODER_DETOKENIZE_H_
+#endif // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
index 9f854e015..1d264b07e 100644
--- a/third_party/aom/av1/decoder/dthread.h
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_DTHREAD_H_
-#define AV1_DECODER_DTHREAD_H_
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
#include "config/aom_config.h"
@@ -79,4 +79,4 @@ void av1_frameworker_copy_context(AVxWorker *const dst_worker,
} // extern "C"
#endif
-#endif // AV1_DECODER_DTHREAD_H_
+#endif // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
index bb604f684..7214a9bed 100644
--- a/third_party/aom/av1/decoder/inspection.h
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AOM_INSPECTION_H_
-#define AOM_INSPECTION_H_
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
#ifdef __cplusplus
extern "C" {
@@ -81,4 +81,4 @@ int ifd_inspect(insp_frame_data *fd, void *decoder);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-#endif // AOM_INSPECTION_H_
+#endif // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
index 715bc6837..44ecf818e 100644
--- a/third_party/aom/av1/decoder/obu.c
+++ b/third_party/aom/av1/decoder/obu.c
@@ -18,6 +18,7 @@
#include "aom_ports/mem_ops.h"
#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
#include "av1/common/timing.h"
#include "av1/decoder/decoder.h"
#include "av1/decoder/decodeframe.h"
@@ -42,85 +43,6 @@ typedef enum {
SCALABILITY_SS = 14
} SCALABILITY_STRUCTURES;
-// Returns 1 when OBU type is valid, and 0 otherwise.
-static int valid_obu_type(int obu_type) {
- int valid_type = 0;
- switch (obu_type) {
- case OBU_SEQUENCE_HEADER:
- case OBU_TEMPORAL_DELIMITER:
- case OBU_FRAME_HEADER:
- case OBU_TILE_GROUP:
- case OBU_METADATA:
- case OBU_FRAME:
- case OBU_REDUNDANT_FRAME_HEADER:
- case OBU_TILE_LIST:
- case OBU_PADDING: valid_type = 1; break;
- default: break;
- }
- return valid_type;
-}
-
-// Parses OBU header and stores values in 'header'.
-static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
- int is_annexb, ObuHeader *header) {
- if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
-
- const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
- if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
-
- header->size = 1;
-
- if (aom_rb_read_bit(rb) != 0) {
- // Forbidden bit. Must not be set.
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
-
- if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
-
- header->has_extension = aom_rb_read_bit(rb);
- header->has_size_field = aom_rb_read_bit(rb);
-
- if (!header->has_size_field && !is_annexb) {
- // section 5 obu streams must have obu_size field set.
- return AOM_CODEC_UNSUP_BITSTREAM;
- }
-
- if (aom_rb_read_bit(rb) != 0) {
- // obu_reserved_1bit must be set to 0.
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- if (header->has_extension) {
- if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
-
- header->size += 1;
- header->temporal_layer_id = aom_rb_read_literal(rb, 3);
- header->spatial_layer_id = aom_rb_read_literal(rb, 2);
- if (aom_rb_read_literal(rb, 3) != 0) {
- // extension_header_reserved_3bits must be set to 0.
- return AOM_CODEC_CORRUPT_FRAME;
- }
- }
-
- return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
- size_t *consumed, ObuHeader *header,
- int is_annexb) {
- if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
-
- // TODO(tomfinegan): Set the error handler here and throughout this file, and
- // confirm parsing work done via aom_read_bit_buffer is successful.
- struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
- NULL };
- aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
- if (parse_result == AOM_CODEC_OK) *consumed = header->size;
- return parse_result;
-}
-
aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
int operating_point_idc, unsigned int *number_spatial_layers,
unsigned int *number_temporal_layers) {
@@ -208,7 +130,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
SequenceHeader *const seq_params = &sh;
seq_params->profile = av1_read_profile(rb);
- if (seq_params->profile > PROFILE_2) {
+ if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
return 0;
}
@@ -349,10 +271,8 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
// If a sequence header has been decoded before, we check if the new
// one is consistent with the old one.
if (pbi->sequence_header_ready) {
- if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Inconsistent sequence headers received.");
- }
+ if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+ pbi->sequence_header_changed = 1;
}
cm->seq_params = *seq_params;
@@ -620,9 +540,9 @@ static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
static void scalability_structure(struct aom_read_bit_buffer *rb) {
int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
- int spatial_layer_dimensions_present_flag = aom_rb_read_literal(rb, 1);
- int spatial_layer_description_present_flag = aom_rb_read_literal(rb, 1);
- int temporal_group_description_present_flag = aom_rb_read_literal(rb, 1);
+ int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+ int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+ int temporal_group_description_present_flag = aom_rb_read_bit(rb);
aom_rb_read_literal(rb, 3); // reserved
if (spatial_layer_dimensions_present_flag) {
@@ -643,8 +563,8 @@ static void scalability_structure(struct aom_read_bit_buffer *rb) {
temporal_group_size = aom_rb_read_literal(rb, 8);
for (i = 0; i < temporal_group_size; i++) {
aom_rb_read_literal(rb, 3);
- aom_rb_read_literal(rb, 1);
- aom_rb_read_literal(rb, 1);
+ aom_rb_read_bit(rb);
+ aom_rb_read_bit(rb);
int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
for (j = 0; j < temporal_group_ref_cnt; j++) {
aom_rb_read_literal(rb, 8);
@@ -716,61 +636,6 @@ static size_t read_metadata(const uint8_t *data, size_t sz) {
return sz;
}
-static aom_codec_err_t read_obu_size(const uint8_t *data,
- size_t bytes_available,
- size_t *const obu_size,
- size_t *const length_field_size) {
- uint64_t u_obu_size = 0;
- if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
- 0) {
- return AOM_CODEC_CORRUPT_FRAME;
- }
-
- if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
- *obu_size = (size_t)u_obu_size;
- return AOM_CODEC_OK;
-}
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
- size_t bytes_available,
- int is_annexb,
- ObuHeader *obu_header,
- size_t *const payload_size,
- size_t *const bytes_read) {
- size_t length_field_size = 0, obu_size = 0;
- aom_codec_err_t status;
-
- if (is_annexb) {
- // Size field comes before the OBU header, and includes the OBU header
- status =
- read_obu_size(data, bytes_available, &obu_size, &length_field_size);
-
- if (status != AOM_CODEC_OK) return status;
- }
-
- struct aom_read_bit_buffer rb = { data + length_field_size,
- data + bytes_available, 0, NULL, NULL };
-
- status = read_obu_header(&rb, is_annexb, obu_header);
- if (status != AOM_CODEC_OK) return status;
-
- if (is_annexb) {
- // Derive the payload size from the data we've already read
- if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
-
- *payload_size = obu_size - obu_header->size;
- } else {
- // Size field comes after the OBU header, and is just the payload size
- status = read_obu_size(data + obu_header->size,
- bytes_available - obu_header->size, payload_size,
- &length_field_size);
- if (status != AOM_CODEC_OK) return status;
- }
-
- *bytes_read = length_field_size + obu_header->size;
- return AOM_CODEC_OK;
-}
-
// On success, returns a boolean that indicates whether the decoding of the
// current frame is finished. On failure, sets cm->error.error_code and
// returns -1.
@@ -781,8 +646,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
int frame_decoding_finished = 0;
int is_first_tg_obu_received = 1;
uint32_t frame_header_size = 0;
- int seq_header_received = 0;
- size_t seq_header_size = 0;
ObuHeader obu_header;
memset(&obu_header, 0, sizeof(obu_header));
pbi->seen_frame_header = 0;
@@ -853,19 +716,8 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
pbi->seen_frame_header = 0;
break;
case OBU_SEQUENCE_HEADER:
- if (!seq_header_received) {
- decoded_payload_size = read_sequence_header_obu(pbi, &rb);
- if (cm->error.error_code != AOM_CODEC_OK) return -1;
-
- seq_header_size = decoded_payload_size;
- seq_header_received = 1;
- } else {
- // Seeing another sequence header, skip as all sequence headers are
- // required to be identical except for the contents of
- // operating_parameters_info and the amount of trailing bits.
- // TODO(yaowu): verifying redundant sequence headers are identical.
- decoded_payload_size = seq_header_size;
- }
+ decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+ if (cm->error.error_code != AOM_CODEC_OK) return -1;
break;
case OBU_FRAME_HEADER:
case OBU_REDUNDANT_FRAME_HEADER:
@@ -889,6 +741,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
assert(rb.bit_offset == 0);
rb.bit_offset = 8 * frame_header_size;
}
+
decoded_payload_size = frame_header_size;
pbi->frame_header_size = frame_header_size;
@@ -938,6 +791,11 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
decoded_payload_size = read_metadata(data, payload_size);
break;
case OBU_TILE_LIST:
+ if (CONFIG_NORMAL_TILE_MODE) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+
// This OBU type is purely for the large scale tile coding mode.
// The common camera frame header has to be already decoded.
if (!pbi->camera_frame_header_ready) {
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
index 5f2197058..5ab243fc9 100644
--- a/third_party/aom/av1/decoder/obu.h
+++ b/third_party/aom/av1/decoder/obu.h
@@ -9,35 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_DECODER_OBU_H
-#define AV1_DECODER_OBU_H
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
#include "aom/aom_codec.h"
#include "av1/decoder/decoder.h"
-typedef struct {
- size_t size; // Size (1 or 2 bytes) of the OBU header (including the
- // optional OBU extension header) in the bitstream.
- OBU_TYPE type;
- int has_size_field;
- int has_extension;
- // The following fields come from the OBU extension header and therefore are
- // only used if has_extension is true.
- int temporal_layer_id;
- int spatial_layer_id;
-} ObuHeader;
-
-aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
- size_t *consumed, ObuHeader *header,
- int is_annexb);
-
-aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
- size_t bytes_available,
- int is_annexb,
- ObuHeader *obu_header,
- size_t *const payload_size,
- size_t *const bytes_read);
-
// Try to decode one frame from a buffer.
// Returns 1 if we decoded a frame,
// 0 if we didn't decode a frame but that's okay
@@ -51,4 +28,4 @@ aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
int operating_point_idc, unsigned int *num_spatial_layers,
unsigned int *num_temporal_layers);
-#endif
+#endif // AOM_AV1_DECODER_OBU_H_
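The OBU header parser that obu.c just lost (it moved to av1/common/obu_util.c, per the new include above) reads a fixed layout: a forbidden bit, a 4-bit type, an extension flag, a size-field flag, and a reserved bit, plus an optional extension byte holding a 3-bit temporal id, a 2-bit spatial id, and 3 reserved bits. A sketch of the same layout with plain shifts (a hypothetical standalone helper, not the obu_util API):

#include <stddef.h>
#include <stdint.h>

typedef struct {
  int type;            // 4-bit obu_type
  int has_extension;   // obu_extension_flag
  int has_size_field;  // obu_has_size_field
  int temporal_id;     // valid only if has_extension
  int spatial_id;      // valid only if has_extension
} ObuHeaderSketch;

// Returns the header size in bytes (1 or 2), or -1 on a malformed header.
static int parse_obu_header(const uint8_t *data, size_t size,
                            ObuHeaderSketch *h) {
  if (size < 1) return -1;
  const uint8_t b0 = data[0];
  if (b0 & 0x80) return -1;           // obu_forbidden_bit must be 0
  h->type = (b0 >> 3) & 0xf;          // obu_type
  h->has_extension = (b0 >> 2) & 1;   // obu_extension_flag
  h->has_size_field = (b0 >> 1) & 1;  // obu_has_size_field
  if (b0 & 1) return -1;              // obu_reserved_1bit must be 0
  if (!h->has_extension) return 1;
  if (size < 2) return -1;
  const uint8_t b1 = data[1];
  h->temporal_id = (b1 >> 5) & 7;     // temporal_id
  h->spatial_id = (b1 >> 3) & 3;      // spatial_id
  if (b1 & 7) return -1;              // extension_header_reserved_3bits
  return 2;
}

For example, a first byte of 0x12 (binary 00010010) decodes as type 2 (a temporal delimiter) with the size field present and no extension byte.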
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
index b721b6d2b..80f8e2e66 100644
--- a/third_party/aom/av1/encoder/aq_complexity.c
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -143,9 +143,10 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
aom_clear_system_state();
- low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy,
- MIN_DEFAULT_LV_THRESH)
- : DEFAULT_LV_THRESH;
+ low_var_thresh =
+ (cpi->oxcf.pass == 2)
+ ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
logvar = av1_log_block_var(cpi, mb, bs);
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
index af525b36d..3421d74c9 100644
--- a/third_party/aom/av1/encoder/aq_complexity.h
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_
-#define AV1_ENCODER_AQ_COMPLEXITY_H_
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
#ifdef __cplusplus
extern "C" {
@@ -34,4 +34,4 @@ void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_COMPLEXITY_H_
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index dec2c730d..f532d48da 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -80,9 +80,11 @@ CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
}
void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
- aom_free(cr->map);
- aom_free(cr->last_coded_q_map);
- aom_free(cr);
+ if (cr != NULL) {
+ aom_free(cr->map);
+ aom_free(cr->last_coded_q_map);
+ aom_free(cr);
+ }
}
// Check if we should turn off cyclic refresh based on bitrate condition.
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
index 459ab80b8..b45781983 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.h
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_
-#define AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
#include "av1/common/blockd.h"
@@ -95,4 +95,4 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index 6cb6adc42..58f906bdc 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -14,34 +14,33 @@
#include "aom_ports/mem.h"
#include "av1/encoder/aq_variance.h"
-
#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/dwt.h"
#include "aom_ports/system_state.h"
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
#define ENERGY_MIN (-4)
#define ENERGY_MAX (1)
#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
#define ENERGY_IN_BOUNDS(energy) \
assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
-static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
- 0.75, 1.0, 1.0, 1.0 };
-static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
-
-#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
-
DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
DECLARE_ALIGNED(16, static const uint16_t,
av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
-unsigned int av1_vaq_segment_id(int energy) {
- ENERGY_IN_BOUNDS(energy);
- return SEGMENT_ID(energy);
-}
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
void av1_vaq_frame_setup(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
@@ -51,6 +50,12 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
int resolution_change =
cm->prev_frame && (cm->width != cm->prev_frame->width ||
cm->height != cm->prev_frame->height);
+ int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+ double avg_ratio;
+ if (avg_energy > 7) avg_energy = 7;
+ if (avg_energy < 0) avg_energy = 0;
+ avg_ratio = rate_ratio[avg_energy];
+
if (resolution_change) {
memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
av1_clearall_segfeatures(seg);
@@ -69,9 +74,11 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
aom_clear_system_state();
for (i = 0; i < MAX_SEGMENTS; ++i) {
- int qindex_delta =
- av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- rate_ratio[i], cm->seq_params.bit_depth);
+ // Normalize so the segment at the average energy gets a rate ratio of
+ // 1.0, and adjust the other segments around it.
+ int qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio,
+ cm->seq_params.bit_depth);
// We don't allow qindex 0 in a segment if the base value is not 0.
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -87,114 +94,58 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) {
}
}
-/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions
- * of variance() and highbd_8_variance(). It should not.
- */
-static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ // This function returns a score for the block's local variance, calculated
+ // as the sum of the log of the 4x4 variances of each subblock of the
+ // current block (x, bs), averaged over the number of 4x4 subblocks.
+ // This is used for segmentation to avoid situations in which a large
+ // block with a gentle gradient gets marked high variance even though each
+ // subblock has a low variance. This allows us to assign the same segment
+ // number to the same sorts of areas regardless of how the partitioning
+ // goes.
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint64_t *sse, uint64_t *sum) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ double var = 0;
+ unsigned int sse;
int i, j;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- uint64_t sum_long = 0;
- aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-
-static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bs) {
- MACROBLOCKD *xd = &x->e_mbd;
- unsigned int var, sse;
int right_overflow =
(xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
int bottom_overflow =
(xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
- if (right_overflow || bottom_overflow) {
- const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
- const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
- int avg;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
- &sse, &avg);
- sse >>= 2 * (xd->bd - 8);
- avg >>= (xd->bd - 8);
- } else {
- aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
- bw, bh, &sse, &avg);
- }
- var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
- return (unsigned int)((uint64_t)var * 256) / (bw * bh);
- } else {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- var =
- cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
- } else {
- var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- av1_all_zeros, 0, &sse);
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ aom_clear_system_state();
+
+ for (i = 0; i < bh; i += 4) {
+ for (j = 0; j < bw; j += 4) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+ 16);
+ } else {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+ 16);
+ }
}
- return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
}
-}
+ // Use the average of the 4x4 log variances. For 8-bit input the range is
+ // 0 to 9.704121561.
+ var /= (bw / 4 * bh / 4);
+ if (var > 7) var = 7;
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- unsigned int var = block_variance(cpi, x, bs);
aom_clear_system_state();
- return log(var + 1.0);
+ return (int)(var);
}
#define DEFAULT_E_MIDPOINT 10.0
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- double energy;
- double energy_midpoint;
- aom_clear_system_state();
- energy_midpoint =
- (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
- energy = av1_log_block_var(cpi, x, bs) - energy_midpoint;
- return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
-}
unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
MACROBLOCKD *xd = &x->e_mbd;
@@ -231,17 +182,21 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
int block_var_level) {
- ENERGY_IN_BOUNDS(block_var_level);
-
- const int rate_level = SEGMENT_ID(block_var_level);
+ int rate_level;
const AV1_COMMON *const cm = &cpi->common;
+
+ if (DELTAQ_MODULATION == 1) {
+ ENERGY_IN_BOUNDS(block_var_level);
+ rate_level = SEGMENT_ID(block_var_level);
+ } else {
+ rate_level = block_var_level;
+ }
int qindex_delta = av1_compute_qdelta_by_rate(
- &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[rate_level],
+ &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level],
cm->seq_params.bit_depth);
if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
qindex_delta = -cm->base_qindex + 1;
}
-
return qindex_delta;
}
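To see what the rewritten av1_log_block_var() above computes: each 4x4 subblock contributes log(1 + var4x4 / 16), where var4x4 is libaom's unnormalized variance (sum of squared deviations over the 16 pixels), the terms are averaged over the subblocks, and the score is clamped to 7 (the 9.704... figure in the comment is the theoretical 8-bit ceiling before clamping). A self-contained sketch on a plain 8-bit array (helper names are illustrative):

#include <math.h>
#include <stdint.h>

// Unnormalized variance of one 4x4 block: sum of squared deviations.
static double var_4x4(const uint8_t *p, int stride) {
  int sum = 0, sse = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j) {
      const int v = p[i * stride + j];
      sum += v;
      sse += v * v;
    }
  return sse - (double)sum * sum / 16.0;
}

// Average of log(1 + var/16) over all 4x4 subblocks, clamped to [0, 7].
// bw and bh are the block's width and height in pixels (multiples of 4).
static int log_block_var(const uint8_t *src, int stride, int bw, int bh) {
  double score = 0;
  for (int i = 0; i < bh; i += 4)
    for (int j = 0; j < bw; j += 4)
      score += log(1.0 + var_4x4(src + i * stride + j, stride) / 16.0);
  score /= (bw / 4) * (bh / 4);
  if (score > 7) score = 7;
  return (int)score;
}

Averaging per-subblock log variances, rather than taking the log of one whole-block variance, is what keeps a large gentle-gradient block from being scored as high variance.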
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
index b1a8bc38a..2d22b663e 100644
--- a/third_party/aom/av1/encoder/aq_variance.h
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AQ_VARIANCE_H_
-#define AV1_ENCODER_AQ_VARIANCE_H_
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
#include "av1/encoder/encoder.h"
@@ -18,11 +18,9 @@
extern "C" {
#endif
-unsigned int av1_vaq_segment_id(int energy);
void av1_vaq_frame_setup(AV1_COMP *cpi);
-int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
-double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
int block_var_level);
int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -32,4 +30,4 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
} // extern "C"
#endif
-#endif // AV1_ENCODER_AQ_VARIANCE_H_
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
index b92b3469f..98505e0b1 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -11,24 +11,7 @@
#include <stdlib.h>
#include "av1/encoder/av1_fwd_txfm1d.h"
-
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
- int32_t size, int8_t bit);
-
-#define range_check(stage, input, buf, size, bit) \
- range_check_func(stage, input, buf, size, bit)
-#else // CONFIG_COEFFICIENT_RANGE_CHECKING
-
-#define range_check(stage, input, buf, size, bit) \
- { \
- (void)stage; \
- (void)input; \
- (void)buf; \
- (void)size; \
- (void)bit; \
- }
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "av1/common/av1_txfm.h"
void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
@@ -40,7 +23,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[4];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -49,7 +32,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = input[1] + input[2];
bf1[2] = -input[2] + input[1];
bf1[3] = -input[3] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -60,7 +43,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -70,7 +53,7 @@ void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[1] = bf0[2];
bf1[2] = bf0[1];
bf1[3] = bf0[3];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
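
// Editor's note: every stage above leans on the butterfly helper
// half_btf(w0, in0, w1, in1, cos_bit). A sketch consistent with its use here
// (round_shift as defined in av1/common/av1_txfm.h), shown for reference:
#include <stdint.h>

static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t result = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  // round_shift: add half of the divisor before the arithmetic shift.
  return (int32_t)((result + (1LL << (bit - 1))) >> bit);
}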
void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -83,7 +66,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[8];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -96,7 +79,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = -input[5] + input[2];
bf1[6] = -input[6] + input[1];
bf1[7] = -input[7] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -111,7 +94,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
bf1[7] = bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -126,7 +109,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = -bf0[5] + bf0[4];
bf1[6] = -bf0[6] + bf0[7];
bf1[7] = bf0[7] + bf0[6];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -141,7 +124,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -155,7 +138,7 @@ void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = bf0[3];
bf1[7] = bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -168,7 +151,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[16];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -189,7 +172,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -input[13] + input[2];
bf1[14] = -input[14] + input[1];
bf1[15] = -input[15] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -212,7 +195,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -235,7 +218,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -bf0[13] + bf0[14];
bf1[14] = bf0[14] + bf0[13];
bf1[15] = bf0[15] + bf0[12];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -258,7 +241,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -281,7 +264,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -bf0[13] + bf0[12];
bf1[14] = -bf0[14] + bf0[15];
bf1[15] = bf0[15] + bf0[14];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -304,7 +287,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -326,7 +309,7 @@ void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[11];
bf1[14] = bf0[7];
bf1[15] = bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -339,7 +322,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[32];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -376,7 +359,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -input[29] + input[2];
bf1[30] = -input[30] + input[1];
bf1[31] = -input[31] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -415,7 +398,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29];
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -454,7 +437,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[29] + bf0[26];
bf1[30] = bf0[30] + bf0[25];
bf1[31] = bf0[31] + bf0[24];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -493,7 +476,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
bf1[30] = bf0[30];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -532,7 +515,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -bf0[29] + bf0[30];
bf1[30] = bf0[30] + bf0[29];
bf1[31] = bf0[31] + bf0[28];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -571,7 +554,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -610,7 +593,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = -bf0[29] + bf0[28];
bf1[30] = -bf0[30] + bf0[31];
bf1[31] = bf0[31] + bf0[30];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -649,7 +632,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -687,7 +670,7 @@ void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[29] = bf0[23];
bf1[30] = bf0[15];
bf1[31] = bf0[31];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -698,7 +681,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t s0, s1, s2, s3, s4, s5, s6, s7;
// stage 0
- range_check(0, input, input, 4, stage_range[0]);
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
x0 = input[0];
x1 = input[1];
x2 = input[2];
@@ -746,7 +729,7 @@ void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
output[1] = round_shift(s1, bit);
output[2] = round_shift(s2, bit);
output[3] = round_shift(s3, bit);
- range_check(6, input, output, 4, stage_range[6]);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
}
void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -759,7 +742,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[8];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -773,7 +756,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = input[6];
bf1[6] = input[2];
bf1[7] = -input[5];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -788,7 +771,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5];
bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -802,7 +785,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[5] + bf0[7];
bf1[6] = bf0[4] - bf0[6];
bf1[7] = bf0[5] - bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -817,7 +800,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -831,7 +814,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[1] - bf0[5];
bf1[6] = bf0[2] - bf0[6];
bf1[7] = bf0[3] - bf0[7];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -846,7 +829,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -860,7 +843,7 @@ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[5] = bf0[2];
bf1[6] = bf0[7];
bf1[7] = bf0[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -873,7 +856,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[16];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -895,7 +878,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = -input[13];
bf1[14] = -input[5];
bf1[15] = input[10];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -918,7 +901,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13];
bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -940,7 +923,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[13] + bf0[15];
bf1[14] = bf0[12] - bf0[14];
bf1[15] = bf0[13] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -963,7 +946,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -985,7 +968,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[9] - bf0[13];
bf1[14] = bf0[10] - bf0[14];
bf1[15] = bf0[11] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1008,7 +991,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1030,7 +1013,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[5] - bf0[13];
bf1[14] = bf0[6] - bf0[14];
bf1[15] = bf0[7] - bf0[15];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1053,7 +1036,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1075,7 +1058,7 @@ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[13] = bf0[2];
bf1[14] = bf0[15];
bf1[15] = bf0[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1084,14 +1067,14 @@ void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
for (int i = 0; i < 4; ++i)
output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
assert(stage_range[0] + NewSqrt2Bits <= 32);
- range_check(0, input, output, 4, stage_range[0]);
+ av1_range_check_buf(0, input, output, 4, stage_range[0]);
}
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
(void)cos_bit;
for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
- range_check(0, input, output, 8, stage_range[0]);
+ av1_range_check_buf(0, input, output, 8, stage_range[0]);
}
void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1100,14 +1083,14 @@ void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
for (int i = 0; i < 16; ++i)
output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
assert(stage_range[0] + NewSqrt2Bits <= 32);
- range_check(0, input, output, 16, stage_range[0]);
+ av1_range_check_buf(0, input, output, 16, stage_range[0]);
}
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range) {
(void)cos_bit;
for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
- range_check(0, input, output, 32, stage_range[0]);
+ av1_range_check_buf(0, input, output, 32, stage_range[0]);
}
void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
@@ -1120,7 +1103,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
int32_t step[64];
// stage 0;
- range_check(stage, input, input, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
// stage 1;
stage++;
@@ -1189,7 +1172,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -input[61] + input[2];
bf1[62] = -input[62] + input[1];
bf1[63] = -input[63] + input[0];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 2
stage++;
@@ -1260,7 +1243,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 3
stage++;
@@ -1331,7 +1314,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61] + bf0[50];
bf1[62] = bf0[62] + bf0[49];
bf1[63] = bf0[63] + bf0[48];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 4
stage++;
@@ -1402,7 +1385,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61];
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 5
stage++;
@@ -1473,7 +1456,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[61] + bf0[58];
bf1[62] = bf0[62] + bf0[57];
bf1[63] = bf0[63] + bf0[56];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 6
stage++;
@@ -1544,7 +1527,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
bf1[62] = bf0[62];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 7
stage++;
@@ -1615,7 +1598,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -bf0[61] + bf0[62];
bf1[62] = bf0[62] + bf0[61];
bf1[63] = bf0[63] + bf0[60];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 8
stage++;
@@ -1686,7 +1669,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 9
stage++;
@@ -1757,7 +1740,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = -bf0[61] + bf0[60];
bf1[62] = -bf0[62] + bf0[63];
bf1[63] = bf0[63] + bf0[62];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 10
stage++;
@@ -1828,7 +1811,7 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
// stage 11
stage++;
@@ -1898,5 +1881,5 @@ void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
bf1[61] = bf0[47];
bf1[62] = bf0[31];
bf1[63] = bf0[63];
- range_check(stage, input, bf1, size, stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
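
// Editor's note: the local range_check() macro deleted above is replaced by
// the shared av1_range_check_buf() from av1/common/av1_txfm.h. A hedged
// sketch of what such a check verifies -- every intermediate coefficient of
// a stage fits in that stage's signed bit budget (names illustrative only):
#include <assert.h>
#include <stdint.h>

static void range_check_sketch(int32_t stage, const int32_t *buf,
                               int32_t size, int8_t bit) {
  const int64_t max_value = (1LL << (bit - 1)) - 1;
  const int64_t min_value = -(1LL << (bit - 1));
  (void)stage;  // the real helper reports the failing stage for debugging
  for (int i = 0; i < size; ++i) {
    assert(buf[i] >= min_value && buf[i] <= max_value);
  }
}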
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
index 9472af8e6..9dcf16552 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM1D_H_
-#define AV1_FWD_TXFM1D_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
#include "av1/common/av1_txfm.h"
@@ -46,4 +46,4 @@ void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
}
#endif
-#endif // AV1_FWD_TXFM1D_H_
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
index 174689a14..98b6530db 100644
--- a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -9,11 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM2D_CFG_H_
-#define AV1_FWD_TXFM2D_CFG_H_
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
#include "av1/common/enums.h"
#include "av1/encoder/av1_fwd_txfm1d.h"
extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
extern const int8_t fwd_cos_bit_col[5][5];
extern const int8_t fwd_cos_bit_row[5][5];
-#endif // AV1_FWD_TXFM2D_CFG_H_
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index d0477b35b..a0a926005 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -273,35 +273,32 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
- // obsolete skip_block
- const int skip_block = 0;
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
if (qm_ptr != NULL && iqm_ptr != NULL) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan, qm_ptr, iqm_ptr,
- qparam->log_scale);
+ quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
} else {
switch (qparam->log_scale) {
case 0:
- aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
case 1:
- aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
case 2:
- aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan);
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
break;
default: assert(0);
}
@@ -392,28 +389,25 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
const SCAN_ORDER *sc,
const QUANT_PARAM *qparam) {
- // obsolete skip_block
- const int skip_block = 0;
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
if (qm_ptr != NULL && iqm_ptr != NULL) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
- sc->scan, sc->iscan, qm_ptr, iqm_ptr,
- qparam->log_scale);
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
} else {
switch (qparam->log_scale) {
case 0:
if (LIKELY(n_coeffs >= 8)) {
- aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
- p->round_QTX, p->quant_QTX, p->quant_shift_QTX,
- qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
- eob_ptr, sc->scan, sc->iscan);
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
} else {
// TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
// quantization
- aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin_QTX,
+ aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_QTX, p->quant_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
@@ -421,15 +415,15 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
break;
case 1:
aom_highbd_quantize_b_32x32(
- coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
break;
case 2:
aom_highbd_quantize_b_64x64(
- coeff_ptr, n_coeffs, skip_block, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
- p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
break;
default: assert(0);
}
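
// Editor's note: with the dead skip_block argument removed, the b-quantizer
// interface reduces to the zbin/round/quant/shift/dequant tuple. A heavily
// simplified scalar sketch of that pipeline (no quant matrices, no log_scale,
// single-stage multiply rather than the library's quant/quant_shift pair):
#include <stdint.h>
#include <stdlib.h>

static void quantize_coeff_sketch(int32_t coeff, int zbin, int round,
                                  int quant, int dequant, int32_t *qcoeff,
                                  int32_t *dqcoeff) {
  const int sign = coeff < 0 ? -1 : 1;
  const int abs_coeff = abs(coeff);
  if (abs_coeff < zbin) {  // dead zone: quantize to zero
    *qcoeff = *dqcoeff = 0;
  } else {
    const int q = (int)(((int64_t)(abs_coeff + round) * quant) >> 16);
    *qcoeff = sign * q;
    *dqcoeff = *qcoeff * dequant;  // reconstruction used for RD decisions
  }
}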
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
index eaf8374de..35af9a67a 100644
--- a/third_party/aom/av1/encoder/av1_quantize.h
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_QUANTIZE_H_
-#define AV1_ENCODER_QUANTIZE_H_
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
#include "config/aom_config.h"
@@ -145,4 +145,4 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
} // extern "C"
#endif
-#endif // AV1_ENCODER_QUANTIZE_H_
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index 2070755cd..2c4acdb02 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -18,6 +18,7 @@
#include "aom_dsp/binary_codes_writer.h"
#include "aom_dsp/bitwriter_buffer.h"
#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
#include "aom_ports/mem_ops.h"
#include "aom_ports/system_state.h"
#if CONFIG_BITSTREAM_DEBUG
@@ -30,7 +31,6 @@
#include "av1/common/entropymode.h"
#include "av1/common/entropymv.h"
#include "av1/common/mvref_common.h"
-#include "av1/common/odintrin.h"
#include "av1/common/pred_common.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
@@ -66,11 +66,11 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
aom_writer *const w, int plane,
FRAME_COUNTS *counts);
-static void write_intra_mode_kf(FRAME_CONTEXT *frame_ctx,
- const MB_MODE_INFO *mi,
- const MB_MODE_INFO *above_mi,
- const MB_MODE_INFO *left_mi,
- PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode, aom_writer *w) {
assert(!is_intrabc_block(mi));
(void)mi;
aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
@@ -297,7 +297,7 @@ static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
DELTA_Q_PROBS + 1);
if (!smallval) {
- rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ rem_bits = get_msb(abs - 1);
thr = (1 << rem_bits) + 1;
aom_write_literal(w, rem_bits - 1, 3);
aom_write_literal(w, abs - thr, rem_bits);
@@ -326,7 +326,7 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
}
if (!smallval) {
- rem_bits = OD_ILOG_NZ(abs - 1) - 1;
+ rem_bits = get_msb(abs - 1);
thr = (1 << rem_bits) + 1;
aom_write_literal(w, rem_bits - 1, 3);
aom_write_literal(w, abs - thr, rem_bits);
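
// Editor's note: OD_ILOG_NZ(v) returns floor(log2(v)) + 1 for v > 0, so
// OD_ILOG_NZ(v) - 1 equals get_msb(v) and the substitution above is an
// identity, not a behavior change. A self-contained sketch plus a worked
// example of the delta coding:
static int get_msb_sketch(unsigned int v) {  // floor(log2(v)), requires v > 0
  int msb = 0;
  while (v >>= 1) ++msb;
  return msb;
}
// For abs = 13: rem_bits = get_msb_sketch(12) = 3, thr = (1 << 3) + 1 = 9,
// so the writer emits rem_bits - 1 = 2 in 3 bits, then abs - thr = 4 in
// rem_bits = 3 bits.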
@@ -836,8 +836,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
}
}
-static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
- PREDICTION_MODE mode, aom_writer *w) {
+static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+ PREDICTION_MODE mode, aom_writer *w) {
aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
INTRA_MODES);
}
@@ -933,45 +933,24 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
}
}
-static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
- const int mi_col, aom_writer *w) {
+// If delta q is present, writes the delta_q index.
+// Also writes the delta loop filter levels, if present.
+static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int skip, aom_writer *w) {
AV1_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->td.mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
- const struct segmentation *const seg = &cm->seg;
- struct segmentation_probs *const segp = &ec_ctx->seg;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- const PREDICTION_MODE mode = mbmi->mode;
- const int segment_id = mbmi->segment_id;
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const int allow_hp = cm->allow_high_precision_mv;
- const int is_inter = is_inter_block(mbmi);
- const int is_compound = has_second_ref(mbmi);
- int skip, ref;
- (void)mi_row;
- (void)mi_col;
-
- write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
-
- write_skip_mode(cm, xd, segment_id, mbmi, w);
-
- assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
- skip = mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
-
- write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
-
- write_cdef(cm, xd, w, skip, mi_col, mi_row);
-
if (cm->delta_q_present_flag) {
- int super_block_upper_left =
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int super_block_upper_left =
((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
if ((bsize != cm->seq_params.sb_size || skip == 0) &&
super_block_upper_left) {
assert(mbmi->current_qindex > 0);
- int reduced_delta_qindex =
+ const int reduced_delta_qindex =
(mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
write_delta_qindex(xd, reduced_delta_qindex, w);
xd->current_qindex = mbmi->current_qindex;
@@ -996,37 +975,96 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
}
}
}
+}
- if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int is_keyframe,
+ aom_writer *w) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
- if (mbmi->skip_mode) return;
+ // Y mode.
+ if (is_keyframe) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+ } else {
+ write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+ }
- if (!is_inter) {
- write_intra_mode(ec_ctx, bsize, mode, w);
- const int use_angle_delta = av1_use_angle_delta(bsize);
+ // Y angle delta.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ if (use_angle_delta && av1_is_directional_mode(mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+ ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ }
- if (use_angle_delta && av1_is_directional_mode(mode)) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
- ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ // UV mode and UV angle delta.
+ if (!cm->seq_params.monochrome &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+ if (uv_mode == UV_CFL_PRED)
+ write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+ if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+ ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
}
+ }
- if (!cm->seq_params.monochrome &&
- is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y)) {
- const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
- write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
- if (uv_mode == UV_CFL_PRED)
- write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
- if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
- ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
- }
- }
+ // Palette.
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+ write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+ }
- if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
- write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+ // Filter intra.
+ write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
- write_filter_intra_mode_info(cm, xd, mbmi, w);
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+
+ write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
} else {
int16_t mode_ctx;
@@ -1172,11 +1210,7 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
const struct segmentation *const seg = &cm->seg;
struct segmentation_probs *const segp = &ec_ctx->seg;
- const MB_MODE_INFO *const above_mi = xd->above_mbmi;
- const MB_MODE_INFO *const left_mi = xd->left_mbmi;
const MB_MODE_INFO *const mbmi = xd->mi[0];
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const PREDICTION_MODE mode = mbmi->mode;
if (seg->segid_preskip && seg->update_map)
write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
@@ -1188,69 +1222,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
write_cdef(cm, xd, w, skip, mi_col, mi_row);
- if (cm->delta_q_present_flag) {
- int super_block_upper_left =
- ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
- ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
- if ((bsize != cm->seq_params.sb_size || skip == 0) &&
- super_block_upper_left) {
- assert(mbmi->current_qindex > 0);
- int reduced_delta_qindex =
- (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
- write_delta_qindex(xd, reduced_delta_qindex, w);
- xd->current_qindex = mbmi->current_qindex;
- if (cm->delta_lf_present_flag) {
- if (cm->delta_lf_multi) {
- const int frame_lf_count =
- av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
- for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
- int reduced_delta_lflevel =
- (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
- cm->delta_lf_res;
- write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
- xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
- }
- } else {
- int reduced_delta_lflevel =
- (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
- cm->delta_lf_res;
- write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
- xd->delta_lf_from_base = mbmi->delta_lf_from_base;
- }
- }
- }
- }
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
if (av1_allow_intrabc(cm)) {
write_intrabc_info(xd, mbmi_ext, w);
if (is_intrabc_block(mbmi)) return;
}
- write_intra_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
-
- const int use_angle_delta = av1_use_angle_delta(bsize);
- if (use_angle_delta && av1_is_directional_mode(mode)) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
- ec_ctx->angle_delta_cdf[mode - V_PRED]);
- }
-
- if (!cm->seq_params.monochrome &&
- is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y)) {
- const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
- write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
- if (uv_mode == UV_CFL_PRED)
- write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
- if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
- write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
- ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
- }
- }
-
- if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
- write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
-
- write_filter_intra_mode_info(cm, xd, mbmi, w);
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
}
#if CONFIG_RD_DEBUG
@@ -1549,10 +1528,10 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
} else {
write_selected_tx_size(xd, w);
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, 0, xd);
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
}
} else {
- set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
skip && is_inter_block(mbmi), xd);
}
@@ -1694,15 +1673,14 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
}
static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
- aom_writer *const w, const TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end) {
+ aom_writer *const w, int tile_row, int tile_col) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
const int mi_row_start = tile->mi_row_start;
const int mi_row_end = tile->mi_row_end;
const int mi_col_start = tile->mi_col_start;
const int mi_col_end = tile->mi_col_end;
- int mi_row, mi_col;
+ int mi_row, mi_col, sb_row_in_tile;
av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
av1_init_above_context(cm, xd, tile->tile_row);
@@ -1716,13 +1694,21 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
for (mi_row = mi_row_start; mi_row < mi_row_end;
mi_row += cm->seq_params.mib_size) {
+ sb_row_in_tile =
+ (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+ const TOKENEXTRA *tok =
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
+ const TOKENEXTRA *tok_end =
+ tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+
av1_zero_left_context(xd);
for (mi_col = mi_col_start; mi_col < mi_col_end;
mi_col += cm->seq_params.mib_size) {
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+ write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
cm->seq_params.sb_size);
}
+ assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
}
}
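
// Editor's note: a sketch of the per-superblock-row token bookkeeping the
// rewritten write_modes() walks. The struct name below is hypothetical; the
// start/stop/count fields mirror how cpi->tplist entries are consumed above.
struct TokenListSketch {
  const TOKENEXTRA *start;  // first token emitted for this SB row
  const TOKENEXTRA *stop;   // one past the last token (checked by the assert)
  unsigned int count;       // == stop - start
};
// sb_row_in_tile = (mi_row - tile->mi_row_start) >> mib_size_log2 selects the
// row's list, so the writer no longer needs one monolithic per-tile token
// buffer (and the old tok/tok_end plumbing is dropped below).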
@@ -2220,33 +2206,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm,
}
}
-#if USE_GF16_MULTI_LAYER
-static int get_refresh_mask_gf16(AV1_COMP *cpi) {
- if (cpi->common.frame_type == KEY_FRAME || frame_is_sframe(&cpi->common))
- return 0xFF;
-
- int refresh_mask = 0;
-
- if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
- cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_alt_ref_frame) {
- assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
- refresh_mask |= (1 << cpi->refresh_fb_idx);
- }
-
- return refresh_mask;
-}
-#endif // USE_GF16_MULTI_LAYER
-
static int get_refresh_mask(AV1_COMP *cpi) {
if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
frame_is_sframe(&cpi->common))
return 0xFF;
int refresh_mask = 0;
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi);
-#endif // USE_GF16_MULTI_LAYER
// NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
// notified to get LAST3_FRAME refreshed and then the virtual indexes for all
@@ -2281,8 +2246,13 @@ static int get_refresh_mask(AV1_COMP *cpi) {
// Note: This is highly specific to the use of ARF as a forward reference,
// and this needs to be generalized as other uses are implemented
// (like RTC/temporal scalability).
- return refresh_mask |
- (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+
+ if (cpi->preserve_arf_as_gld) {
+ return refresh_mask;
+ } else {
+ return refresh_mask |
+ (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+ }
} else {
const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
return refresh_mask |
@@ -2574,9 +2544,9 @@ static void write_film_grain_params(AV1_COMP *cpi,
aom_wb_write_literal(wb, pars->random_seed, 16);
- pars->random_seed += 3245; // For film grain test vectors purposes
+ pars->random_seed += 3381; // Changing random seed for film grain
if (!pars->random_seed) // Random seed should not be zero
- pars->random_seed += 1735;
+ pars->random_seed += 7391;
if (cm->frame_type == INTER_FRAME)
aom_wb_write_bit(wb, pars->update_parameters);
else
@@ -2685,7 +2655,8 @@ static void write_sb_size(SequenceHeader *seq_params,
aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
}
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
+static void write_sequence_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
AV1_COMMON *const cm = &cpi->common;
SequenceHeader *seq_params = &cm->seq_params;
@@ -2695,8 +2666,10 @@ void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) {
int max_frame_height = cpi->oxcf.forced_max_frame_height
? cpi->oxcf.forced_max_frame_height
: cpi->oxcf.height;
+ // max((int)ceil(log2(max_frame_width)), 1)
const int num_bits_width =
(max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+ // max((int)ceil(log2(max_frame_height)), 1)
const int num_bits_height =
(max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
assert(num_bits_width <= 16);
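
// Editor's note: the comments added above document the standard ceil-log2
// idiom: coding (dimension - 1) needs get_msb(dimension - 1) + 1 bits. A
// self-contained sketch:
static int num_bits_sketch(int max_dimension) {
  if (max_dimension <= 1) return 1;  // clamp, matching the ternary above
  int msb = 0, x = max_dimension - 1;
  while (x >>= 1) ++msb;  // msb == get_msb(max_dimension - 1)
  return msb + 1;
}
// num_bits_sketch(1920) == 11, since 2^10 = 1024 < 1920 <= 2048 = 2^11.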
@@ -2954,7 +2927,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
assert(cm->frame_type == KEY_FRAME);
}
if (!seq_params->reduced_still_picture_hdr) {
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
@@ -3254,14 +3227,14 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (cm->base_qindex > 0) {
aom_wb_write_bit(wb, cm->delta_q_present_flag);
if (cm->delta_q_present_flag) {
- aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+ aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2);
xd->current_qindex = cm->base_qindex;
if (cm->allow_intrabc)
assert(cm->delta_lf_present_flag == 0);
else
aom_wb_write_bit(wb, cm->delta_lf_present_flag);
if (cm->delta_lf_present_flag) {
- aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+ aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2);
aom_wb_write_bit(wb, cm->delta_lf_multi);
av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
}
@@ -3508,7 +3481,7 @@ static void write_bitstream_level(BitstreamLevel bl,
aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
}
-static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
AV1_COMMON *const cm = &cpi->common;
struct aom_write_bit_buffer wb = { dst, 0 };
uint32_t size = 0;
@@ -3619,7 +3592,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
AV1_COMMON *const cm = &cpi->common;
aom_writer mode_bc;
int tile_row, tile_col;
- TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
uint32_t total_size = 0;
const int tile_cols = cm->tile_cols;
@@ -3684,8 +3656,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
const int data_offset = have_tiles ? 4 : 0;
const int tile_idx = tile_row * tile_cols + tile_col;
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
@@ -3703,8 +3673,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
mode_bc.allow_update_cdf =
mode_bc.allow_update_cdf && !cm->disable_cdf_update;
aom_start_encode(&mode_bc, buf->data + data_offset);
- write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
- assert(tok == tok_end);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
aom_stop_encode(&mode_bc);
tile_size = mode_bc.pos;
buf->size = tile_size;
@@ -3794,8 +3763,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
const int tile_idx = tile_row * tile_cols + tile_col;
TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
- const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
- const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
int is_last_tile_in_tg = 0;
if (new_tg) {
@@ -3847,7 +3814,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
aom_start_encode(&mode_bc, dst + total_size);
- write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
aom_stop_encode(&mode_bc);
tile_size = mode_bc.pos;
assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
@@ -3990,7 +3957,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
data += obu_header_size + obu_payload_size + length_field_size;
}
- const int write_frame_header = (cm->num_tg > 1 || cm->show_existing_frame);
+ const int write_frame_header =
+ (cm->num_tg > 1 || encode_show_existing_frame(cm));
struct aom_write_bit_buffer saved_wb;
if (write_frame_header) {
// Write Frame Header OBU.
@@ -4017,7 +3985,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
saved_wb.bit_buffer += length_field_size;
}
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
data_size = 0;
} else {
// Each tile group obu will be preceded by 4-byte size of the tile group
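
// Editor's note: the write_delta_q_params() helper factored out above signals
// the quantizer delta in units of cm->delta_q_res relative to a running
// predictor (xd->current_qindex). A hedged sketch of that arithmetic:
static int reduced_delta_qindex_sketch(int block_qindex, int *running_qindex,
                                       int delta_q_res) {
  // Only multiples of delta_q_res are representable, so the delta is scaled
  // down before signaling; the predictor then tracks the coded value.
  const int reduced = (block_qindex - *running_qindex) / delta_q_res;
  *running_qindex = block_qindex;
  return reduced;
}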
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
index 2047b6833..465ccaed5 100644
--- a/third_party/aom/av1/encoder/bitstream.h
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_BITSTREAM_H_
-#define AV1_ENCODER_BITSTREAM_H_
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
#ifdef __cplusplus
extern "C" {
@@ -20,8 +20,13 @@ extern "C" {
struct aom_write_bit_buffer;
-void write_sequence_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb);
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
uint8_t *const dst);
@@ -32,8 +37,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
// Do not swap gf and arf indices for internal overlay frames
- return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
- !cpi->rc.is_src_frame_ext_arf;
+ return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
}
void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
@@ -44,4 +48,4 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
} // extern "C"
#endif
-#endif // AV1_ENCODER_BITSTREAM_H_
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 003e59e39..0bc5dea82 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_BLOCK_H_
-#define AV1_ENCODER_BLOCK_H_
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
#include "av1/common/entropymv.h"
#include "av1/common/entropy.h"
@@ -170,6 +170,7 @@ typedef struct {
InterpFilters filters;
int_mv mv[2];
int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
} INTERPOLATION_FILTER_STATS;
typedef struct macroblock MACROBLOCK;
@@ -254,6 +255,19 @@ struct macroblock {
PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+  // Buffers for the hash value calculation of a block; used only in
+  // av1_get_block_hash_value(). Indexed as [first hash/second hash]
+  // [two ping-pong buffers].
+ uint32_t *hash_value_buffer[2][2];
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
MvLimits mv_limits;
@@ -344,7 +358,6 @@ struct macroblock {
#if CONFIG_DIST_8X8
int using_dist_8x8;
aom_tune_metric tune_metric;
- DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]);
#endif // CONFIG_DIST_8X8
int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
@@ -352,6 +365,8 @@ struct macroblock {
int tx_search_prune[EXT_TX_SET_TYPES];
int must_find_valid_partition;
int tx_split_prune_flag; // Flag to skip tx split RD search.
+ int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during
+ // interpolation filter search
};
static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
@@ -400,8 +415,38 @@ static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
return depth;
}
+static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+ int skip) {
+ if (skip)
+ x->blk_skip[blk_idx] |= 1UL << plane;
+ else
+ x->blk_skip[blk_idx] &= ~(1UL << plane);
+#ifndef NDEBUG
+ // Set chroma planes to uninitialized states when luma is set to check if
+ // it will be set later
+ if (plane == 0) {
+ x->blk_skip[blk_idx] |= 1UL << (1 + 4);
+ x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+ }
+
+ // Clear the initialization checking bit
+ x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
+static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+#ifndef NDEBUG
+ // Check if this is initialized
+ assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+
+ // The fill pattern is 0x77, so bits 3 and 7 (0x88) are never set by valid
+ // updates; if either is set, the entry contains garbage data.
+ assert((x->blk_skip[blk_idx] & 0x88) == 0);
+#endif
+ return (x->blk_skip[blk_idx] >> plane) & 1;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_BLOCK_H_
+#endif // AOM_AV1_ENCODER_BLOCK_H_
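To make the new blk_skip bit layout concrete, a minimal sketch of one entry's lifecycle in a debug build; 'entry' is a stand-in for x->blk_skip[blk_idx], and the 0x77 fill matches the memset added in encodeframe.c:

    uint8_t entry = 0x77;       // debug fill: bits 0-2 and 4-6 set, 3 and 7 clear
    // set_blk_skip(x, /*plane=*/0, blk_idx, /*skip=*/1) then does:
    entry |= 1u << 0;           // bit 0: luma is skippable
    entry |= 1u << (1 + 4);     // bit 5: chroma U marked uninitialized
    entry |= 1u << (2 + 4);     // bit 6: chroma V marked uninitialized
    entry &= ~(1u << (0 + 4));  // bit 4 cleared: luma is now initialized
    // is_blk_skip(x, 0, blk_idx) asserts that bit 4 is clear and that the
    // never-written bits 3 and 7 (0x88) are still zero before reading bit 0.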
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
index 66dedd9ed..f7cff9e53 100644
--- a/third_party/aom/av1/encoder/blockiness.c
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -16,7 +16,6 @@
#include "av1/common/common.h"
#include "av1/common/filter.h"
#include "aom/aom_integer.h"
-#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
index d6e556b93..57f59f304 100644
--- a/third_party/aom/av1/encoder/context_tree.c
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -175,14 +175,15 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
}
void av1_free_pc_tree(ThreadData *td, const int num_planes) {
- const int tree_nodes_inc = 1024;
-
- const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
- int i;
- for (i = 0; i < tree_nodes; ++i)
- free_tree_contexts(&td->pc_tree[i], num_planes);
- aom_free(td->pc_tree);
- td->pc_tree = NULL;
+ if (td->pc_tree != NULL) {
+ const int tree_nodes_inc = 1024;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ for (int i = 0; i < tree_nodes; ++i) {
+ free_tree_contexts(&td->pc_tree[i], num_planes);
+ }
+ aom_free(td->pc_tree);
+ td->pc_tree = NULL;
+ }
}
void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
index c05f48a7a..4efc34985 100644
--- a/third_party/aom/av1/encoder/context_tree.h
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CONTEXT_TREE_H_
-#define AV1_ENCODER_CONTEXT_TREE_H_
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
#include "av1/common/blockd.h"
#include "av1/encoder/block.h"
@@ -56,6 +56,8 @@ typedef struct {
int hybrid_pred_diff;
int comp_pred_diff;
int single_pred_diff;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ int skip_ref_frame_mask;
// TODO(jingning) Use RD_COST struct here instead. This involves a broader
// scope of refactoring.
@@ -109,4 +111,4 @@ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
} // extern "C"
#endif
-#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
index 0317db5b3..cab59a774 100644
--- a/third_party/aom/av1/encoder/corner_detect.h
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CORNER_DETECT_H_
-#define AV1_ENCODER_CORNER_DETECT_H_
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
#include <stdio.h>
#include <stdlib.h>
@@ -19,4 +19,4 @@
int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
int *points, int max_points);
-#endif // AV1_ENCODER_CORNER_DETECT_H_
+#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
index 3b16f9efc..535d2faed 100644
--- a/third_party/aom/av1/encoder/corner_match.h
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_CORNER_MATCH_H_
-#define AV1_ENCODER_CORNER_MATCH_H_
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
#include <stdio.h>
#include <stdlib.h>
@@ -30,4 +30,4 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
int height, int frm_stride, int ref_stride,
int *correspondence_pts);
-#endif // AV1_ENCODER_CORNER_MATCH_H_
+#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
index 5de7765c5..af5b09837 100644
--- a/third_party/aom/av1/encoder/cost.h
+++ b/third_party/aom/av1/encoder/cost.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_COST_H_
-#define AV1_ENCODER_COST_H_
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
#include "aom_dsp/prob.h"
#include "aom/aom_integer.h"
@@ -44,4 +44,4 @@ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
} // extern "C"
#endif
-#endif // AV1_ENCODER_COST_H_
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
index 03318e5b7..37306c6a5 100644
--- a/third_party/aom/av1/encoder/dwt.h
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
#include "av1/common/common.h"
#include "av1/common/enums.h"
@@ -18,3 +21,5 @@ void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
int hbd);
int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 27ca53761..cb226c59e 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -40,11 +40,11 @@
#include "av1/common/reconinter.h"
#include "av1/common/seg_common.h"
#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
#include "av1/encoder/aq_complexity.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/aq_variance.h"
-#include "av1/common/warped_motion.h"
#include "av1/encoder/global_motion.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemb.h"
@@ -56,6 +56,7 @@
#include "av1/encoder/partition_model_weights.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
@@ -348,8 +349,9 @@ static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
x->skip = 0;
}
-static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
- ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row,
+static void update_state(const AV1_COMP *const cpi,
+ const TileDataEnc *const tile_data, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
int i, x_idx, y;
const AV1_COMMON *const cm = &cpi->common;
@@ -359,7 +361,7 @@ static void update_state(const AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- MB_MODE_INFO *mi = &ctx->mic;
+ const MB_MODE_INFO *const mi = &ctx->mic;
MB_MODE_INFO *const mi_addr = xd->mi[0];
const struct segmentation *const seg = &cm->seg;
const int bw = mi_size_wide[mi->sb_type];
@@ -505,12 +507,12 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
}
-static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCK *const x, int mi_row, int mi_col,
RD_STATS *rd_cost, PARTITION_TYPE partition,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
- const AV1_COMMON *const cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -522,6 +524,13 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
int i, orig_rdmult;
+ if (best_rd < 0) {
+ ctx->rdcost = INT64_MAX;
+ ctx->skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
aom_clear_system_state();
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -588,9 +597,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
if (aq_mode == VARIANCE_AQ) {
if (cpi->vaq_refresh) {
- const int energy =
- bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
- mbmi->segment_id = av1_vaq_segment_id(energy);
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
}
x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
@@ -1407,8 +1417,8 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
RUN_TYPE dry_run, BLOCK_SIZE bsize,
- PARTITION_TYPE partition, PICK_MODE_CONTEXT *ctx,
- int *rate) {
+ PARTITION_TYPE partition,
+ const PICK_MODE_CONTEXT *const ctx, int *rate) {
TileInfo *const tile = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *xd = &x->e_mbd;
@@ -1691,7 +1701,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
}
if (do_partition_search &&
@@ -1728,7 +1738,20 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
pc_tree->partitioning = partition;
}
}
-
+ for (int b = 0; b < 2; ++b) {
+ pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 3; ++b) {
+ pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+ pc_tree->verticala[b].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 4; ++b) {
+ pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+ }
switch (partition) {
case PARTITION_NONE:
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
@@ -1741,7 +1764,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_row + hbs < cm->mi_rows) {
RD_STATS tmp_rdc;
- PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
av1_init_rd_stats(&tmp_rdc);
update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1765,7 +1788,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_col + hbs < cm->mi_cols) {
RD_STATS tmp_rdc;
- PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
+ const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
av1_init_rd_stats(&tmp_rdc);
update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
@@ -1812,7 +1835,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
case PARTITION_HORZ_A:
case PARTITION_HORZ_B:
case PARTITION_HORZ_4:
- case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types");
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
default: assert(0); break;
}
@@ -2164,7 +2188,8 @@ static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
-static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+static INLINE void load_pred_mv(MACROBLOCK *x,
+ const PICK_MODE_CONTEXT *const ctx) {
memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
@@ -2221,12 +2246,11 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
// Try searching for an encoding for the given subblock. Returns zero if the
// rdcost is already too high (to tell the caller not to bother searching for
// encodings of further subblocks)
-static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
- TileDataEnc *tile_data, TOKENEXTRA **tp,
- int is_first, int is_last, int mi_row, int mi_col,
- BLOCK_SIZE subsize, RD_STATS *best_rdc,
- RD_STATS *sum_rdc, RD_STATS *this_rdc,
- PARTITION_TYPE partition,
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS *best_rdc, RD_STATS *sum_rdc,
+ RD_STATS *this_rdc, PARTITION_TYPE partition,
PICK_MODE_CONTEXT *prev_ctx,
PICK_MODE_CONTEXT *this_ctx) {
#define RTS_X_RATE_NOCOEF_ARG
@@ -2236,25 +2260,20 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
- // On the first time around, write the rd stats straight to sum_rdc. Also, we
- // should treat sum_rdc as containing zeros (even if it doesn't) to avoid
- // having to zero it at the start.
- if (is_first) this_rdc = sum_rdc;
- const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost;
- const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost;
+ const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc->rdcost - sum_rdc->rdcost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
rdcost_remaining);
- if (!is_first) {
- if (this_rdc->rate == INT_MAX) {
- sum_rdc->rdcost = INT64_MAX;
- } else {
- sum_rdc->rate += this_rdc->rate;
- sum_rdc->dist += this_rdc->dist;
- sum_rdc->rdcost += this_rdc->rdcost;
- }
+ if (this_rdc->rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc->rate;
+ sum_rdc->dist += this_rdc->dist;
+ sum_rdc->rdcost += this_rdc->rdcost;
}
if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
@@ -2271,7 +2290,7 @@ static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
#undef RTS_MAX_RDCOST
}
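The INT64_MAX handling above keeps the "no budget yet" sentinel intact instead of subtracting from it; a small sketch of the intent, with invented values:

    int64_t best = INT64_MAX;  // nothing beaten yet: the budget is unbounded
    int64_t spent = 123456;    // rd cost accumulated by earlier subblocks
    // A naive best - spent yields a huge but non-sentinel number, which
    // callees that test for INT64_MAX would treat as a real bound:
    int64_t remaining = (best == INT64_MAX) ? INT64_MAX : best - spent;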
-static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
PC_TREE *pc_tree, RD_STATS *best_rdc,
PICK_MODE_CONTEXT ctxs[3],
@@ -2284,13 +2303,16 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
MACROBLOCKD *const xd = &x->e_mbd;
RD_STATS sum_rdc, this_rdc;
#define RTP_STX_TRY_ARGS
-
- if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0,
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
return;
- if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1,
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
return;
@@ -2302,15 +2324,13 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
// difference (obviously) doesn't contribute to the error.
const int try_block2 = 1;
if (try_block2 &&
- !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2,
+ !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
best_rdc, &sum_rdc, &this_rdc,
RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
return;
if (sum_rdc.rdcost >= best_rdc->rdcost) return;
- int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rdc.rate += x->partition_cost[pl][partition];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost >= best_rdc->rdcost) return;
@@ -2321,45 +2341,6 @@ static void rd_test_partition3(const AV1_COMP *const cpi, ThreadData *td,
#undef RTP_STX_TRY_ARGS
}
-#if CONFIG_DIST_8X8
-static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
- uint8_t *src_plane_8x8[MAX_MB_PLANE],
- uint8_t *dst_plane_8x8[MAX_MB_PLANE]) {
- const AV1_COMMON *const cm = &cpi->common;
- const int num_planes = av1_num_planes(cm);
- MACROBLOCKD *const xd = &x->e_mbd;
- int64_t dist_8x8, dist_8x8_uv, total_dist;
- const int src_stride = x->plane[0].src.stride;
- int plane;
-
- const int dst_stride = xd->plane[0].dst.stride;
- dist_8x8 =
- av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0],
- dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
- << 4;
-
- // Compute chroma distortion for a luma 8x8 block
- dist_8x8_uv = 0;
-
- if (num_planes > 1) {
- for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- unsigned sse;
- const int src_stride_uv = x->plane[plane].src.stride;
- const int dst_stride_uv = xd->plane[plane].dst.stride;
- const int ssx = xd->plane[plane].subsampling_x;
- const int ssy = xd->plane[plane].subsampling_y;
- const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy);
-
- cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv,
- dst_plane_8x8[plane], dst_stride_uv, &sse);
- dist_8x8_uv += (int64_t)sse << 4;
- }
- }
-
- return total_dist = dist_8x8 + dist_8x8_uv;
-}
-#endif // CONFIG_DIST_8X8
-
static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
pc_tree->partitioning = PARTITION_NONE;
pc_tree->cb_search_range = SEARCH_FULL_PLANE;
@@ -2372,7 +2353,7 @@ static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
}
}
-static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
RD_STATS *rd_cost, int64_t best_rd,
@@ -2410,7 +2391,12 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
(void)*tp_orig;
(void)split_rd;
- av1_zero(pc_tree->pc_tree_stats);
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
pc_tree->pc_tree_stats.valid = 1;
// Override partition costs at the edges of the frame in the same
@@ -2441,9 +2427,11 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
#ifndef NDEBUG
// Nothing should rely on the default value of this array (which is just
- // leftover from encoding the previous block. Setting it to magic number
+ // leftover from encoding the previous block). Set it to a fixed pattern
// when debugging.
- memset(x->blk_skip, 234, sizeof(x->blk_skip));
+ // Bits 0-2 are the blk_skip flags of the three planes; bits 4-6 are the
+ // corresponding initialization-check bits.
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
#endif // NDEBUG
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -2456,19 +2444,35 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
xd->left_txfm_context =
xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
// PARTITION_NONE
if (partition_none_allowed) {
- if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -2476,9 +2480,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
if (none_rd) *none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
if (bsize_at_least_8x8) {
- const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
- ? partition_cost[PARTITION_NONE]
- : 0;
this_rdc.rate += pt_cost;
this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
}
@@ -2520,17 +2521,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
int64_t temp_best_rdcost = best_rdc.rdcost;
pn_rdc = best_rdc;
-#if CONFIG_DIST_8X8
- uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
- if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
- for (int i = 0; i < MAX_MB_PLANE; i++) {
- src_plane_8x8[i] = x->plane[i].src.buf;
- dst_plane_8x8[i] = xd->plane[i].dst.buf;
- }
- }
-#endif // CONFIG_DIST_8X8
-
// PARTITION_SPLIT
if (do_square_split) {
int reached_last_index = 0;
@@ -2548,6 +2538,8 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
pc_tree->split[idx]->index = idx;
int64_t *p_split_rd = &split_rd[idx];
+ // TODO(Cherma): Account for the partition cost when passing best rd to
+ // rd_pick_sqr_partition().
rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
mi_col + x_idx, subsize, &this_rdc,
temp_best_rdcost - sum_rdc.rdcost,
@@ -2568,14 +2560,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
}
reached_last_index = (idx == 4);
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && reached_last_index &&
- sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
- sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
-
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
sum_rdc.rate += partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -2634,14 +2618,6 @@ static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
}
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
- best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
- encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
- pc_tree, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (bsize == cm->seq_params.sb_size) {
assert(best_rdc.rate < INT_MAX);
assert(best_rdc.dist < INT64_MAX);
@@ -2791,6 +2767,99 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
}
#undef FEATURE_SIZE
+static void ml_prune_rect_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_rd, int64_t none_rd,
+ int64_t *split_rd,
+ int *const dst_prune_horz,
+ int *const dst_prune_vert) {
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ best_rd = AOMMAX(best_rd, 1);
+ const NN_CONFIG *nn_config = NULL;
+ const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+ float cur_thresh = 0.0f;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_rect_partition_nnconfig_8;
+ cur_thresh = prob_thresholds[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_rect_partition_nnconfig_16;
+ cur_thresh = prob_thresholds[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_rect_partition_nnconfig_32;
+ cur_thresh = prob_thresholds[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_rect_partition_nnconfig_64;
+ cur_thresh = prob_thresholds[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_rect_partition_nnconfig_128;
+ cur_thresh = prob_thresholds[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+ aom_clear_system_state();
+
+ // 1. Compute input features
+ float features[9];
+
+ // RD cost ratios
+ for (int i = 0; i < 5; i++) features[i] = 1.0f;
+ if (none_rd > 0 && none_rd < 1000000000)
+ features[0] = (float)none_rd / (float)best_rd;
+ for (int i = 0; i < 4; i++) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ features[1 + i] = (float)split_rd[i] / (float)best_rd;
+ }
+
+ // Variance ratios
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int whole_block_variance;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ whole_block_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ whole_block_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+ int split_variance[4];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ struct buf_2d buf;
+ buf.stride = x->plane[0].src.stride;
+ const int bw = block_size_wide[bsize];
+ for (int i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bw / 2;
+ const int y_idx = (i >> 1) * bw / 2;
+ buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ split_variance[i] =
+ av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
+ } else {
+ split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
+ }
+ }
+
+ for (int i = 0; i < 4; i++)
+ features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+ // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+ float raw_scores[3] = { 0.0f };
+ av1_nn_predict(features, nn_config, raw_scores);
+ float probs[3] = { 0.0f };
+ av1_nn_softmax(raw_scores, probs, 3);
+
+ // probs[0] is the probability that both rectangular partitions are worse
+ // than the current best_rd.
+ if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
+ if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+}
+
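The pruning decision above reduces to a softmax over three raw scores followed by a per-block-size probability threshold. A standalone sketch of that last step, assuming av1_nn_softmax computes an ordinary numerically-stabilized softmax (the helper below is illustrative, not the library routine):

    #include <math.h>
    static void softmax3(const float s[3], float p[3]) {
      const float m = fmaxf(s[0], fmaxf(s[1], s[2]));
      float sum = 0.0f;
      for (int i = 0; i < 3; ++i) { p[i] = expf(s[i] - m); sum += p[i]; }
      for (int i = 0; i < 3; ++i) p[i] /= sum;  // p[0] + p[1] + p[2] == 1
    }
    // p[1] and p[2] estimate how likely HORZ/VERT are to beat the current
    // best; a direction is pruned only when its probability falls at or
    // below cur_thresh (0.2%-1% depending on block size).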
// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
// considered.
static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
@@ -2880,13 +2949,14 @@ static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
#define FEATURES 18
#define LABELS 4
// Use an ML model to predict if horz4 and vert4 should be considered.
-static void ml_prune_4_partition(const AV1_COMP *const cpi,
- const MACROBLOCK *const x, BLOCK_SIZE bsize,
- int part_ctx, int64_t best_rd,
- int64_t horz_rd[2], int64_t vert_rd[2],
- int64_t split_rd[4],
+static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t horz_rd[2],
+ int64_t vert_rd[2], int64_t split_rd[4],
int *const partition_horz4_allowed,
- int *const partition_vert4_allowed) {
+ int *const partition_vert4_allowed,
+ unsigned int pb_source_variance, int mi_row,
+ int mi_col) {
if (best_rd >= 1000000000) return;
const NN_CONFIG *nn_config = NULL;
switch (bsize) {
@@ -2903,7 +2973,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
float features[FEATURES];
int feature_index = 0;
features[feature_index++] = (float)part_ctx;
- features[feature_index++] = (float)get_unsigned_bits(x->source_variance);
+ features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
int sub_block_rdcost[8] = { 0 };
@@ -2937,6 +3007,8 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
{
BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common));
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -2990,7 +3062,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
}
}
- const float denom = (float)(x->source_variance + 1);
+ const float denom = (float)(pb_source_variance + 1);
const float low_b = 0.1f;
const float high_b = 10.0f;
for (int i = 0; i < 4; ++i) {
@@ -3022,9 +3094,9 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
// Make decisions based on the model scores.
int thresh = max_score;
switch (bsize) {
- case BLOCK_16X16: thresh -= 400; break;
- case BLOCK_32X32: thresh -= 400; break;
- case BLOCK_64X64: thresh -= 100; break;
+ case BLOCK_16X16: thresh -= 500; break;
+ case BLOCK_32X32: thresh -= 500; break;
+ case BLOCK_64X64: thresh -= 200; break;
default: break;
}
*partition_horz4_allowed = 0;
@@ -3039,10 +3111,73 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi,
#undef FEATURES
#undef LABELS
+#define FEATURES 4
+// ML-based partition search breakout.
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance) {
+ const NN_CONFIG *nn_config = NULL;
+ int thresh = 0;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_partition_breakout_nnconfig_8;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_partition_breakout_nnconfig_16;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_partition_breakout_nnconfig_32;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_partition_breakout_nnconfig_64;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_partition_breakout_nnconfig_128;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config || thresh < 0) return 0;
+
+ // Generate feature values.
+ float features[FEATURES];
+ int feature_index = 0;
+ aom_clear_system_state();
+
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+ rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+ features[feature_index++] = rate_f;
+
+ const float dist_f =
+ (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+ features[feature_index++] = dist_f;
+
+ features[feature_index++] = (float)pb_source_variance;
+
+ const int dc_q = (int)x->plane[0].dequant_QTX[0];
+ features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+ assert(feature_index == FEATURES);
+
+ // Calculate score using the NN model.
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, &score);
+
+ // Make decision.
+ return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+
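To make the feature scaling in ml_predict_breakout() concrete, a worked example with invented inputs; the constants 128.0f, 512.0f and 256.0f are the ones used above, everything else is illustrative:

    // bsize = BLOCK_16X16, so num_pels_log2 = 8 (256 pixels);
    // x->rdmult = 256, rd_stats->rate = 1024, rd_stats->dist = 65536,
    // dc_q = 32.
    float rate_f = (256.0f / 128.0f / 512.0f / 256.0f) * 1024.0f;  // 0.015625
    float dist_f = (float)(65536 >> 8);                            // 256.0f
    float qsq_f = (32.0f * 32.0f) / 256.0f;                        // 4.0f
    // The NN score scaled by 100 is compared against the per-size speed
    // feature threshold; meeting it halts further square and rectangular
    // splits of this block.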
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
+static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
RD_STATS *rd_cost, int64_t best_rd,
@@ -3068,6 +3203,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
int do_rectangular_split = 1;
+ int64_t cur_none_rd = 0;
int64_t split_rd[4] = { 0, 0, 0, 0 };
int64_t horz_rd[2] = { 0, 0 };
int64_t vert_rd[2] = { 0, 0 };
@@ -3077,6 +3213,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
int vert_ctx_is_ready = 0;
BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
// Override skipping rectangular partition operations for edge blocks
@@ -3129,9 +3271,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
#ifndef NDEBUG
// Nothing should rely on the default value of this array (which is just
- // leftover from encoding the previous block. Setting it to magic number
+ // leftover from encoding the previous block). Set it to a fixed pattern
// when debugging.
- memset(x->blk_skip, 234, sizeof(x->blk_skip));
+ // Bits 0-2 are the blk_skip flags of the three planes; bits 4-6 are the
+ // corresponding initialization-check bits.
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
#endif // NDEBUG
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
@@ -3143,7 +3287,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
- x->mb_energy = av1_block_energy(cpi, x, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
const int cb_partition_search_ctrl =
@@ -3285,22 +3429,56 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
}
#endif
+ // Ref frames picked in the i-th quarter subblock during the square
+ // partition RD search. They may be used to prune ref frame selection for
+ // rectangular partitions.
+ int ref_frames_used[4] = {
+ 0,
+ };
+
BEGIN_PARTITION_SEARCH:
if (x->must_find_valid_partition) {
partition_none_allowed = has_rows && has_cols;
partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
}
+
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+ if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
// PARTITION_NONE
if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX)
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_rdc.rdcost);
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+ pb_source_variance = x->source_variance;
if (none_rd) *none_rd = this_rdc.rdcost;
+ cur_none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+ for (int i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref_type);
+ }
+ }
if (bsize_at_least_8x8) {
- const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
- ? partition_cost[PARTITION_NONE]
- : 0;
this_rdc.rate += pt_cost;
this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
}
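This change (mirrored below for the other partition types) charges the cost of signalling the partition into the rd budget before the mode search instead of after it; a sketch of the accounting, where rd() is shorthand for RDCOST(x->rdmult, rate, dist) rather than a real helper:

    int64_t pt_rd = rd(pt_cost, /*dist=*/0);   // cost of signalling NONE
    int64_t budget = (best_rdc.rdcost == INT64_MAX)
                         ? INT64_MAX
                         : best_rdc.rdcost - pt_rd;
    // The mode search runs against the tighter budget; pt_cost is still
    // added back into this_rdc afterwards, so the totals are unchanged.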
@@ -3318,16 +3496,29 @@ BEGIN_PARTITION_SEARCH:
best_rdc = this_rdc;
if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
- // If all y, u, v transform blocks in this partition are skippable, and
- // the dist & rate are within the thresholds, the partition search is
- // terminated for current branch of the partition search tree.
- // The dist & rate thresholds are set to 0 at speed 0 to disable the
- // early termination at that speed.
- if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
- (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
- best_rdc.rate < rate_breakout_thr)) {
- do_square_split = 0;
- do_rectangular_split = 0;
+ if ((do_square_split || do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && xd->bd == 8;
+ if (use_ml_based_breakout) {
+ if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
+ pb_source_variance)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
}
#if CONFIG_FP_MB_STATS
@@ -3384,24 +3575,14 @@ BEGIN_PARTITION_SEARCH:
// store estimated motion vector
if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
-#if CONFIG_DIST_8X8
- uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
- if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
- for (int i = 0; i < num_planes; i++) {
- src_plane_8x8[i] = x->plane[i].src.buf;
- dst_plane_8x8[i] = xd->plane[i].dst.buf;
- }
- }
-#endif // CONFIG_DIST_8X8
-
// PARTITION_SPLIT
if (do_square_split) {
av1_init_rd_stats(&sum_rdc);
- int reached_last_index = 0;
subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
- int idx;
+ sum_rdc.rate = partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int idx;
for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
const int x_idx = (idx & 1) * mi_step;
const int y_idx = (idx >> 1) * mi_step;
@@ -3413,8 +3594,13 @@ BEGIN_PARTITION_SEARCH:
pc_tree->split[idx]->index = idx;
int64_t *p_split_rd = &split_rd[idx];
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ if (cpi->sf.prune_ref_frame_for_rect_partitions)
+ pc_tree->split[idx]->none.rate = INT_MAX;
rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost,
+ subsize, &this_rdc, best_remain_rdcost,
pc_tree->split[idx], p_split_rd);
if (this_rdc.rate == INT_MAX) {
@@ -3424,11 +3610,16 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
-
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ pc_tree->split[idx]->none.rate != INT_MAX) {
+ const int ref_type =
+ av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
+ ref_frames_used[idx] |= (1 << ref_type);
+ }
if (idx <= 1 && (bsize <= BLOCK_8X8 ||
pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
- MB_MODE_INFO *const mbmi = &(pc_tree->split[idx]->none.mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
@@ -3436,60 +3627,83 @@ BEGIN_PARTITION_SEARCH:
}
}
}
- reached_last_index = (idx == 4);
-
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && reached_last_index &&
- sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
+ const int reached_last_index = (idx == 4);
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_SPLIT;
}
- } else if (cpi->sf.less_rectangular_check) {
+ } else if (cpi->sf.less_rectangular_check_level > 0) {
// skip rectangular partition test when larger block size
// gives better rd cost
- do_rectangular_split &= !partition_none_allowed;
+ if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
+ do_rectangular_split &= !partition_none_allowed;
}
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
} // if (do_split)
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
+ }
+
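A worked example of the mask construction above, with invented reference-type bit positions: suppose the two top quarter blocks only ever picked reference types 1 and 7 during the square-partition pass. The top horizontal half is then restricted to those types:

    int ref_frames_used[4] = { 1 << 1, 1 << 7, 0, 0 };
    int used = ref_frames_used[0] | ref_frames_used[1];  // 0x82
    int skip_mask = used ? ~used : 0;
    // pc_tree->horizontal[0].skip_ref_frame_mask = skip_mask; any reference
    // type whose bit is set in the mask is skipped during the HORZ search.
    // A mask of 0 (no usable stats from the square pass) disables pruning.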
+ int prune_horz = 0;
+ int prune_vert = 0;
+ if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+ (partition_horz_allowed || partition_vert_allowed)) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+ split_rd, &prune_horz, &prune_vert);
+ }
+
// PARTITION_HORZ
- if (partition_horz_allowed &&
+ if (partition_horz_allowed && !prune_horz &&
(do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_HORZ);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->horizontal[0].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ }
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
- best_rdc.rdcost);
- horz_rd[0] = sum_rdc.rdcost;
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ horz_rd[0] = this_rdc.rdcost;
if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
- PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
- MB_MODE_INFO *const mbmi = &(pc_tree->horizontal[0].mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
@@ -3501,24 +3715,15 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->horizontal[1].pred_interp_filter =
av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
-
+ }
rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
horz_rd[1] = this_rdc.rdcost;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
- update_state(cpi, tile_data, td, &pc_tree->horizontal[1],
- mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL);
- encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
- mi_row + mi_step, mi_col, subsize, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
@@ -3526,24 +3731,9 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
- bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_HORZ];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3555,7 +3745,7 @@ BEGIN_PARTITION_SEARCH:
}
// PARTITION_VERT
- if (partition_vert_allowed &&
+ if (partition_vert_allowed && !prune_vert &&
(do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_VERT);
@@ -3563,18 +3753,31 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->vertical[0].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ }
+ sum_rdc.rate = partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
PARTITION_VERT, subsize, &pc_tree->vertical[0],
- best_rdc.rdcost);
- vert_rd[0] = sum_rdc.rdcost;
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ vert_rd[0] = this_rdc.rdcost;
const int64_t vert_max_rdcost = best_rdc.rdcost;
if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
- MB_MODE_INFO *const mbmi = &(pc_tree->vertical[0].mic);
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
// Neither palette mode nor cfl predicted
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
@@ -3587,24 +3790,15 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
- partition_none_allowed)
+ partition_none_allowed) {
pc_tree->vertical[1].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
-
+ }
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
PARTITION_VERT, subsize, &pc_tree->vertical[1],
best_rdc.rdcost - sum_rdc.rdcost);
vert_rd[1] = this_rdc.rdcost;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
- update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row,
- mi_col + mi_step, subsize, DRY_RUN_NORMAL);
- encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
- mi_col + mi_step, subsize, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
@@ -3612,25 +3806,9 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
- bsize == BLOCK_8X8) {
- int64_t dist_8x8;
- dist_8x8 = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
- // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 &&
- 0 /* !CONFIG_CFL */)
- assert(sum_rdc.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
- sum_rdc.dist = dist_8x8;
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
- }
-#endif // CONFIG_DIST_8X8
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_VERT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3641,6 +3819,17 @@ BEGIN_PARTITION_SEARCH:
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pb_source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ pb_source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ }
+
const int ext_partition_allowed =
do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
@@ -3649,15 +3838,26 @@ BEGIN_PARTITION_SEARCH:
int horzab_partition_allowed = ext_partition_allowed;
int vertab_partition_allowed = ext_partition_allowed;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+ horzab_partition_allowed = 0;
+ vertab_partition_allowed = 0;
+ }
+ }
+#endif
+
if (cpi->sf.prune_ext_partition_types_search_level) {
if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
(pc_tree->partitioning == PARTITION_NONE &&
- x->source_variance < 32) ||
+ pb_source_variance < 32) ||
pc_tree->partitioning == PARTITION_SPLIT);
vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
(pc_tree->partitioning == PARTITION_NONE &&
- x->source_variance < 32) ||
+ pb_source_variance < 32) ||
pc_tree->partitioning == PARTITION_SPLIT);
} else {
horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
@@ -3712,6 +3912,9 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
partition_horz_allowed && partition_vert_allowed) {
+ // TODO(huisu@google.com): x->source_variance may not be the current block's
+ // variance. The correct one to use is pb_source_variance.
+ // Need to re-train the model to fix it.
ml_prune_ab_partition(bsize, pc_tree->partitioning,
get_unsigned_bits(x->source_variance),
best_rdc.rdcost, horz_rd, vert_rd, split_rd,
@@ -3736,6 +3939,21 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontala[1].rd_mode_is_ready = 1;
}
}
+ pc_tree->horizontala[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames)
+ pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
@@ -3754,6 +3972,21 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
pc_tree->horizontalb[0].rd_mode_is_ready = 1;
}
+ pc_tree->horizontalb[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames)
+ pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_B, mi_row, mi_col, subsize,
@@ -3773,6 +4006,18 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
pc_tree->verticala[0].rd_mode_is_ready = 1;
}
+ pc_tree->verticala[0].skip_ref_frame_mask = 0;
+ pc_tree->verticala[1].skip_ref_frame_mask = 0;
+ pc_tree->verticala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_A, mi_row, mi_col, bsize2,
@@ -3791,6 +4036,18 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
pc_tree->verticalb[0].rd_mode_is_ready = 1;
}
+ pc_tree->verticalb[0].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[1].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
+ }
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
@@ -3823,9 +4080,19 @@ BEGIN_PARTITION_SEARCH:
partition_horz_allowed && partition_vert_allowed) {
ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
- &partition_vert4_allowed);
+ &partition_vert4_allowed, pb_source_variance, mi_row,
+ mi_col);
}
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+ partition_horz4_allowed = 0;
+ partition_vert4_allowed = 0;
+ }
+ }
+#endif
+
// PARTITION_HORZ_4
if (partition_horz4_allowed && has_rows &&
(do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
@@ -3834,25 +4101,33 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
for (int i = 0; i < 4; ++i) {
- int this_mi_row = mi_row + i * quarter_step;
+ const int this_mi_row = mi_row + i * quarter_step;
if (i > 0 && this_mi_row >= cm->mi_rows) break;
PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
ctx_this->rd_mode_is_ready = 0;
- if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3),
- this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc,
- &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this))
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[1])
+ : (ref_frames_used[2] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+ mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_HORZ_4, ctx_prev, ctx_this))
break;
ctx_prev = ctx_this;
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_HORZ_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3870,16 +4145,25 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+ sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
for (int i = 0; i < 4; ++i) {
- int this_mi_col = mi_col + i * quarter_step;
+ const int this_mi_col = mi_col + i * quarter_step;
if (i > 0 && this_mi_col >= cm->mi_cols) break;
PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
ctx_this->rd_mode_is_ready = 0;
- if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row,
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[2])
+ : (ref_frames_used[1] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
PARTITION_VERT_4, ctx_prev, ctx_this))
break;
@@ -3888,7 +4172,6 @@ BEGIN_PARTITION_SEARCH:
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- sum_rdc.rate += partition_cost[PARTITION_VERT_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
@@ -3924,14 +4207,6 @@ BEGIN_PARTITION_SEARCH:
}
}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
- best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
- encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
- pc_tree, NULL);
- }
-#endif // CONFIG_DIST_8X8
-
if (bsize == cm->seq_params.sb_size) {
assert(best_rdc.rate < INT_MAX);
assert(best_rdc.dist < INT64_MAX);
@@ -3950,6 +4225,15 @@ static void init_first_partition_pass_stats_tables(
}
}
+// Recursively clear pc_tree_stats for the tree rooted at pt.
+static INLINE void clear_pc_tree_stats(PC_TREE *pt) {
+ if (pt == NULL) return;
+ pt->pc_tree_stats.valid = 0;
+ for (int i = 0; i < 4; ++i) {
+ clear_pc_tree_stats(pt->split[i]);
+ }
+}
+
// Minimum number of samples to trigger the
// mode_pruning_based_on_two_pass_partition_search feature.
#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
@@ -3963,7 +4247,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
SPEED_FEATURES *const sf = &cpi->sf;
- int mi_col;
const int leaf_nodes = 256;
// Initialize the left context for the new SB row
@@ -3977,26 +4260,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
+ PC_TREE *const pc_root =
+ td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
// Code each SB in the row
- for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += cm->seq_params.mib_size) {
- const struct segmentation *const seg = &cm->seg;
- int dummy_rate;
- int64_t dummy_dist;
- RD_STATS dummy_rdc;
- int i;
- int seg_skip = 0;
-
- const int idx_str = cm->mi_stride * mi_row + mi_col;
- MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
- PC_TREE *const pc_root =
- td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
-
av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
av1_fill_mode_rates(cm, x, xd->tile_ctx);
if (sf->adaptive_pred_interp_filter) {
- for (i = 0; i < leaf_nodes; ++i) {
+ for (int i = 0; i < leaf_nodes; ++i) {
td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4015,10 +4288,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
av1_zero(x->pred_mv);
pc_root->index = 0;
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
if (seg->enabled) {
const uint8_t *const map =
seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
- int segment_id =
+ const int segment_id =
map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
: 0;
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
@@ -4039,15 +4314,14 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
cpi, block_wavelet_energy_level);
} else {
const int block_var_level =
- av1_block_energy(cpi, x, cm->seq_params.sb_size);
+ av1_log_block_var(cpi, x, cm->seq_params.sb_size);
x->sb_energy_level = block_var_level;
offset_qindex =
av1_compute_deltaq_from_energy_level(cpi, block_var_level);
}
- int qmask = ~(cm->delta_q_res - 1);
+ const int qmask = ~(cm->delta_q_res - 1);
int current_qindex = clamp(cm->base_qindex + offset_qindex,
cm->delta_q_res, 256 - cm->delta_q_res);
-
current_qindex =
((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
cm->base_qindex;
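The masking above snaps the clamped q-index onto the delta_q_res grid centred on base_qindex: adding half the step before ANDing with qmask rounds to the nearest multiple, which works because delta_q_res is a power of two. A worked sketch with illustrative values (delta_q_res = 4, base_qindex = 60; the clamp step is omitted):

#include <stdio.h>

int main(void) {
  const int delta_q_res = 4, base_qindex = 60;
  const int qmask = ~(delta_q_res - 1);  // ~3: keeps multiples of 4
  int current_qindex = base_qindex + 7;  // offset_qindex of 7
  current_qindex =
      ((current_qindex - base_qindex + delta_q_res / 2) & qmask) + base_qindex;
  printf("%d\n", current_qindex);  // 68: the offset 7 rounds to 8
  return 0;
}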
@@ -4058,18 +4332,16 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
xd->mi[0]->current_qindex = current_qindex;
av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
- int j, k;
- int lfmask = ~(cm->delta_lf_res - 1);
- int delta_lf_from_base = offset_qindex / 2;
- delta_lf_from_base =
- ((delta_lf_from_base + cm->delta_lf_res / 2) & lfmask);
+ const int lfmask = ~(cm->delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask);
// Pre-set the delta lf for the loop filter. Note that this value is set
// before mi is assigned for each block in the current superblock.
- for (j = 0; j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row);
- j++) {
- for (k = 0; k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col);
- k++) {
+ for (int j = 0;
+ j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) {
+ for (int k = 0;
+ k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) {
cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
.delta_lf_from_base =
clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
@@ -4085,19 +4357,24 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
x->source_variance = UINT_MAX;
if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
- BLOCK_SIZE bsize;
set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
- bsize = seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
+ const BLOCK_SIZE bsize =
+ seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
pc_root);
} else if (cpi->partition_search_skippable_frame) {
- BLOCK_SIZE bsize;
set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
- bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ const BLOCK_SIZE bsize =
+ get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
@@ -4113,9 +4390,9 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
reset_partition(pc_root, cm->seq_params.sb_size);
x->use_cb_search_range = 0;
init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+ // Do the first pass when two-pass partition search is needed.
if (cpi->sf.two_pass_partition_search &&
- cpi->sf.use_square_partition_only_threshold <
- cm->seq_params.sb_size &&
+ cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
cm->frame_type != KEY_FRAME) {
@@ -4123,6 +4400,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
// Reset the stats tables.
if (sf->mode_pruning_based_on_two_pass_partition_search)
av1_zero(x->first_partition_pass_stats);
+ clear_pc_tree_stats(pc_root);
rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
pc_root, NULL);
@@ -4130,7 +4408,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->source_variance = UINT_MAX;
if (sf->adaptive_pred_interp_filter) {
- for (i = 0; i < leaf_nodes; ++i) {
+ for (int i = 0; i < leaf_nodes; ++i) {
td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -4157,7 +4435,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->use_cb_search_range = 1;
if (sf->mode_pruning_based_on_two_pass_partition_search) {
- for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
FIRST_PARTITION_PASS_STATS *const stat =
&x->first_partition_pass_stats[i];
if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
@@ -4174,21 +4452,17 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
}
}
}
-
- rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
- cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
- pc_root, NULL);
- } else {
- rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
- cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
- pc_root, NULL);
}
+
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root,
+ NULL);
}
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
// TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
cm->tile_rows == 1) {
- av1_inter_mode_data_fit(x->rdmult);
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
}
#endif
}
@@ -4233,6 +4507,32 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
return cpi->common.tx_mode;
}
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+ }
+}
+
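av1_alloc_tile_data lays all tiles out in one row-major array, so tile (tile_row, tile_col) lives at index tile_row * tile_cols + tile_col. A reduced sketch of the same layout with plain malloc (TileData is a stand-in for TileDataEnc):

#include <stdio.h>
#include <stdlib.h>

typedef struct { int id; } TileData;  // stand-in for TileDataEnc

int main(void) {
  const int tile_rows = 2, tile_cols = 3;
  TileData *tiles = malloc(sizeof(*tiles) * tile_rows * tile_cols);
  if (tiles == NULL) return 1;
  for (int r = 0; r < tile_rows; ++r)
    for (int c = 0; c < tile_cols; ++c)
      tiles[r * tile_cols + c].id = r * tile_cols + c;
  printf("tile (1,2) -> slot %d\n", tiles[1 * tile_cols + 2].id);  // 5
  free(tiles);
  return 0;
}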
void av1_init_tile_data(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -4240,28 +4540,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
const int tile_rows = cm->tile_rows;
int tile_col, tile_row;
TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ TOKENLIST *tplist = cpi->tplist[0][0];
unsigned int tile_tok = 0;
-
- if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
- if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
- CHECK_MEM_ERROR(
- cm, cpi->tile_data,
- aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
- cpi->allocated_tiles = tile_cols * tile_rows;
-
- for (tile_row = 0; tile_row < tile_rows; ++tile_row)
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileDataEnc *const tile_data =
- &cpi->tile_data[tile_row * tile_cols + tile_col];
- int i, j;
- for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- tile_data->thresh_freq_fact[i][j] = 32;
- tile_data->mode_map[i][j] = j;
- }
- }
- }
- }
+ int tplist_count = 0;
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -4274,6 +4555,9 @@ void av1_init_tile_data(AV1_COMP *cpi) {
pre_tok = cpi->tile_tok[tile_row][tile_col];
tile_tok = allocated_tokens(
*tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = cpi->tplist[tile_row][tile_col];
+ tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
tile_data->allow_update_cdf = !cm->large_scale_tile;
tile_data->allow_update_cdf =
tile_data->allow_update_cdf && !cm->disable_cdf_update;
@@ -4281,15 +4565,56 @@ void av1_init_tile_data(AV1_COMP *cpi) {
}
}
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = NULL;
+ int sb_row_in_tile;
+ int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+
+ int num_mb_rows_in_sb =
+ ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+
+ sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+
+ get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
+ (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+
+ assert(
+ (unsigned int)(tok -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
+ get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+
+ (void)tile_mb_cols;
+ (void)num_mb_rows_in_sb;
+}
+
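av1_encode_sb_row brackets the token stream of each superblock row with start/stop pointers so the packing stage can later emit exactly that slice; count is plain pointer arithmetic over the TOKENEXTRA array. A reduced sketch of the bookkeeping (both types are stand-ins for the encoder's):

#include <stdio.h>

typedef struct { int dummy; } TOKENEXTRA;  // stand-in
typedef struct {
  TOKENEXTRA *start, *stop;
  unsigned int count;
} TOKENLIST;

// Pretend the row produced three tokens.
static TOKENEXTRA *encode_row(TOKENEXTRA *tok) { return tok + 3; }

int main(void) {
  TOKENEXTRA pool[16];
  TOKENLIST tplist;
  TOKENEXTRA *tok = pool;
  tplist.start = tok;
  tok = encode_row(tok);
  tplist.stop = tok;
  tplist.count = (unsigned int)(tplist.stop - tplist.start);
  printf("tokens in this SB row: %u\n", tplist.count);  // 3
  return 0;
}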
void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
int tile_col) {
AV1_COMMON *const cm = &cpi->common;
TileDataEnc *const this_tile =
&cpi->tile_data[tile_row * cm->tile_cols + tile_col];
const TileInfo *const tile_info = &this_tile->tile_info;
- TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
int mi_row;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ av1_inter_mode_data_init(this_tile);
+#endif
+
av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
tile_info->mi_col_end, tile_row);
av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
@@ -4310,25 +4635,23 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
mi_row += cm->seq_params.mib_size) {
- encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
}
-
- cpi->tok_count[tile_row][tile_col] =
- (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
- assert(cpi->tok_count[tile_row][tile_col] <=
- allocated_tokens(*tile_info,
- cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
- av1_num_planes(cm)));
}
static void encode_tiles(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
int tile_col, tile_row;
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
av1_init_tile_data(cpi);
- for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
}
@@ -4616,6 +4939,13 @@ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
return 0;
}
+static void set_default_interp_skip_flags(AV1_COMP *cpi) {
+ const int num_planes = av1_num_planes(&cpi->common);
+ cpi->default_interp_skip_flags = (num_planes == 1)
+ ? DEFAULT_LUMA_INTERP_SKIP_FLAG
+ : DEFAULT_INTERP_SKIP_FLAG;
+}
+
static void encode_frame_internal(AV1_COMP *cpi) {
ThreadData *const td = &cpi->td;
MACROBLOCK *const x = &td->mb;
@@ -4683,41 +5013,41 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_hash_table_create(&cm->cur_frame->hash_table);
av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 4);
av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 8);
av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 16);
av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 32);
av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
block_hash_values[1], is_block_same[0],
- is_block_same[1]);
+ is_block_same[1], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
pic_width, pic_height, 64);
av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
block_hash_values[0], is_block_same[1],
- is_block_same[0]);
+ is_block_same[0], &cpi->td.mb);
av1_add_to_hash_map_by_row_with_precal_data(
&cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
pic_width, pic_height, 128);
@@ -4769,7 +5099,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_initialize_rd_consts(cpi);
av1_initialize_me_consts(cpi, x, cm->base_qindex);
init_encode_frame_mb_context(cpi);
-
+ set_default_interp_skip_flags(cpi);
if (cm->prev_frame)
cm->last_frame_seg_map = cm->prev_frame->seg_map;
else
@@ -4793,6 +5123,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_zero(rdc->global_motion_used);
av1_zero(cpi->gmparams_cost);
+#if !CONFIG_GLOBAL_MOTION_SEARCH
+ cpi->global_motion_search_done = 1;
+#endif // !CONFIG_GLOBAL_MOTION_SEARCH
if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
!cpi->global_motion_search_done) {
YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
@@ -4939,27 +5272,13 @@ static void encode_frame_internal(AV1_COMP *cpi) {
}
#endif
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- av1_inter_mode_data_init();
-#endif
-
- // If allowed, encoding tiles in parallel with one thread handling one tile.
- // TODO(geza.lore): The multi-threaded encoder is not safe with more than
- // 1 tile rows, as it uses the single above_context et al arrays from
- // cpi->common
- if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
+ if (cpi->row_mt && (cpi->oxcf.max_threads > 1))
+ av1_encode_tiles_mt(cpi);
+ else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
av1_encode_tiles_mt(cpi);
else
encode_tiles(cpi);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#if INTER_MODE_RD_TEST
- if (cpi->sf.inter_mode_rd_model_estimation) {
- av1_inter_mode_data_show(cm);
- }
-#endif
-#endif
-
aom_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
}
@@ -5407,7 +5726,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
}
mbmi->tx_size = tx_size;
- set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
(mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
}
CFL_CTX *const cfl = &xd->cfl;
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index 62141dba4..e8cf9b468 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEFRAME_H_
-#define AV1_ENCODER_ENCODEFRAME_H_
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
#include "aom/aom_integer.h"
#include "av1/common/blockd.h"
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define DELTAQ_MODULATION 0 // 0: variance based, 1: wavelet AC energy based
+#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based
struct macroblock;
struct yv12_buffer_config;
@@ -33,12 +33,15 @@ void av1_setup_src_planes(struct macroblock *x,
void av1_encode_frame(struct AV1_COMP *cpi);
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
void av1_init_tile_data(struct AV1_COMP *cpi);
void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEFRAME_H_
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index cea8db6f9..ad12577e6 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -222,11 +222,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
a = &args->ta[blk_col];
l = &args->tl[blk_row];
- // Assert not magic number (uninitialized).
- assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
- if ((plane != 0 || x->blk_skip[blk_row * bw + blk_col] == 0) &&
- !mbmi->skip_mode) {
+ if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
if (args->enable_optimize_b) {
@@ -350,6 +347,66 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
}
}
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Block and transform sizes, in log2 of the number of 4x4 blocks
+ // ("*_b"): 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+ // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int i = 0, r, c;
+
+ // If mb_to_right_edge is < 0, the current block extends into the
+ // unrestricted motion vector (UMV) border, and we won't visit the
+ // sub-blocks that lie wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ int blk_row, blk_col;
+
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+}
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+ }
+}
+
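The two helpers above are a visitor pattern: the caller passes a callback that runs once per transform block while the block index advances by txw_unit * txh_unit. A cut-down sketch of the traversal over a single plane (the 64x64 processing-unit tiling and UMV clipping are dropped; names are illustrative):

#include <stdio.h>

typedef void (*visitor)(int blk_row, int blk_col, void *arg);

// Walk a blocks_high x blocks_wide grid (in 4x4 units) in tx-size steps.
static void foreach_tx_block(int blocks_wide, int blocks_high, int txw_unit,
                             int txh_unit, visitor visit, void *arg) {
  for (int r = 0; r < blocks_high; r += txh_unit)
    for (int c = 0; c < blocks_wide; c += txw_unit) visit(r, c, arg);
}

static void count_blocks(int blk_row, int blk_col, void *arg) {
  (void)blk_row;
  (void)blk_col;
  ++*(int *)arg;
}

int main(void) {
  int n = 0;
  // A 16x16 block is 4x4 units wide/high; 8x8 transforms step by 2 units.
  foreach_tx_block(4, 4, 2, 2, count_blocks, &n);
  printf("%d transform blocks visited\n", n);  // 4
  return 0;
}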
typedef struct encode_block_pass1_args {
AV1_COMMON *cm;
MACROBLOCK *x;
@@ -382,7 +439,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
txfm_param.tx_set_type = av1_get_ext_tx_set_type(
txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
if (txfm_param.is_hbd) {
- av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
return;
}
av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
@@ -513,9 +570,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- // Assert not magic number (uninitialized).
- assert(plane != 0 || x->blk_skip[blk_row * bw + blk_col] != 234);
- if (plane == 0 && x->blk_skip[blk_row * bw + blk_col]) {
+ if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
*eob = 0;
p->txb_entropy_ctx[block] = 0;
} else {
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
index 673f87ea7..39080de59 100644
--- a/third_party/aom/av1/encoder/encodemb.h
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEMB_H_
-#define AV1_ENCODER_ENCODEMB_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
#include "config/aom_config.h"
@@ -47,7 +47,18 @@ typedef enum AV1_XFORM_QUANT {
void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
int mi_row, int mi_col, RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes);
+
void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+
void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, TX_TYPE tx_type,
@@ -82,4 +93,4 @@ void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEMB_H_
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
index 944e2c53d..42eb5abf6 100644
--- a/third_party/aom/av1/encoder/encodemv.c
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -18,19 +18,37 @@
#include "av1/encoder/encodemv.h"
#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t log_in_base_2(unsigned int n) {
+ // get_msb() is only valid when n != 0.
+ return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
+ const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+ ? MV_CLASS_10
+ : (MV_CLASS_TYPE)log_in_base_2(z >> 3);
+ if (offset) *offset = z - mv_class_base(c);
+ return c;
+}
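get_mv_class splits a motion-vector magnitude into a class (roughly its log2) plus an offset within the class; encode_mv_component then slices the offset into integer, fractional, and high-precision bits. A worked sketch, assuming CLASS0_SIZE is 2 as in av1/common/entropymv.h:

#include <stdio.h>

#define CLASS0_SIZE 2  // assumption: matches av1/common/entropymv.h

static int mv_class_base(int c) { return c ? CLASS0_SIZE << (c + 2) : 0; }

static int floor_log2(unsigned int n) {  // 0 maps to 0, like log_in_base_2
  int b = 0;
  while (n >>= 1) ++b;
  return b;
}

int main(void) {
  const int mag = 23;                // |mv component|
  const int z = mag - 1;             // 22
  const int c = floor_log2(z >> 3);  // class 1, which covers z = 16..31
  const int offset = z - mv_class_base(c);  // 22 - 16 = 6
  printf("class %d: int %d, frac %d, hp %d\n", c, offset >> 3,
         (offset >> 1) & 3, offset & 1);  // class 1: int 0, frac 3, hp 0
  return 0;
}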
static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
MvSubpelPrecision precision) {
+ assert(comp != 0);
int offset;
const int sign = comp < 0;
const int mag = sign ? -comp : comp;
- const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int mv_class = get_mv_class(mag - 1, &offset);
const int d = offset >> 3; // int mv data
const int fr = (offset >> 1) & 3; // fractional mv data
const int hp = offset & 1; // high precision mv data
- assert(comp != 0);
-
// Sign
aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
@@ -89,7 +107,7 @@ static void build_nmv_component_cost_table(int *mvcost,
for (v = 1; v <= MV_MAX; ++v) {
int z, c, o, d, e, f, cost = 0;
z = v - 1;
- c = av1_get_mv_class(z, &o);
+ c = get_mv_class(z, &o);
cost += class_cost[c];
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
index 64e9e7162..37ff547c8 100644
--- a/third_party/aom/av1/encoder/encodemv.h
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODEMV_H_
-#define AV1_ENCODER_ENCODEMV_H_
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
#include "av1/encoder/encoder.h"
@@ -40,8 +40,16 @@ void av1_find_best_ref_mvs_from_stack(int allow_hp,
int_mv *nearest_mv, int_mv *near_mv,
int is_integer);
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+ }
+}
+
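av1_get_mv_joint simply classifies which MV components are non-zero, and the four outcomes map one-to-one onto MV_JOINT_*. A toy illustration, assuming the enum ordering of av1/common/mv.h (MV here is a stand-in struct):

#include <stdio.h>

typedef struct { short row, col; } MV;  // stand-in for av1/common/mv.h

typedef enum {
  MV_JOINT_ZERO = 0,    // both components zero
  MV_JOINT_HNZVZ = 1,   // col (horizontal) non-zero, row zero
  MV_JOINT_HZVNZ = 2,   // row (vertical) non-zero, col zero
  MV_JOINT_HNZVNZ = 3,  // both components non-zero
} MV_JOINT_TYPE;

static MV_JOINT_TYPE get_mv_joint(const MV *mv) {
  if (mv->row == 0) return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
  return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
}

int main(void) {
  const MV mv = { 0, 5 };             // row 0, col 5
  printf("%d\n", get_mv_joint(&mv));  // 1: MV_JOINT_HNZVZ
  return 0;
}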
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEMV_H_
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 13ea32e38..a2da2df89 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -14,9 +14,28 @@
#include <stdio.h>
#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "av1/common/alloccommon.h"
#include "av1/common/cdef.h"
@@ -38,6 +57,7 @@
#include "av1/encoder/encodetxb.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/firstpass.h"
+#include "av1/encoder/grain_test_vectors.h"
#include "av1/encoder/hash_motion.h"
#include "av1/encoder/mbgraph.h"
#include "av1/encoder/picklpf.h"
@@ -49,26 +69,6 @@
#include "av1/encoder/speed_features.h"
#include "av1/encoder/temporal_filter.h"
-#include "aom_dsp/psnr.h"
-#if CONFIG_INTERNAL_STATS
-#include "aom_dsp/ssim.h"
-#endif
-#include "av1/encoder/grain_test_vectors.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#if CONFIG_DENOISE
-#include "aom_dsp/grain_table.h"
-#include "aom_dsp/noise_util.h"
-#include "aom_dsp/noise_model.h"
-#endif
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
-
#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
// av1 uses 10,000,000 ticks/second as time stamp
@@ -413,18 +413,13 @@ static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
}
void av1_initialize_enc(void) {
- static volatile int init_done = 0;
-
- if (!init_done) {
- av1_rtcd();
- aom_dsp_rtcd();
- aom_scale_rtcd();
- av1_init_intra_predictors();
- av1_init_me_luts();
- av1_rc_init_minq_luts();
- av1_init_wedge_masks();
- init_done = 1;
- }
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ av1_rc_init_minq_luts();
+ av1_init_wedge_masks();
}
static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
@@ -506,6 +501,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->td.mb.wsrc_buf);
cpi->td.mb.wsrc_buf = NULL;
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 2; j++) {
+ aom_free(cpi->td.mb.hash_value_buffer[i][j]);
+ cpi->td.mb.hash_value_buffer[i][j] = NULL;
+ }
aom_free(cpi->td.mb.mask_buf);
cpi->td.mb.mask_buf = NULL;
@@ -527,10 +527,18 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->tile_tok[0][0]);
cpi->tile_tok[0][0] = 0;
+ aom_free(cpi->tplist[0][0]);
+ cpi->tplist[0][0] = NULL;
+
av1_free_pc_tree(&cpi->td, num_planes);
aom_free(cpi->td.mb.palette_buffer);
+ aom_free(cpi->td.mb.tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
+ }
+
#if CONFIG_DENOISE
if (cpi->denoise_and_model) {
aom_denoise_and_model_free(cpi->denoise_and_model);
@@ -785,6 +793,10 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
av1_alloc_context_buffers(cm, cm->width, cm->height);
+ int mi_rows_aligned_to_sb =
+ ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
av1_alloc_txb_buf(cpi);
alloc_context_buffers_ext(cpi);
@@ -797,6 +809,11 @@ static void alloc_compressor_data(AV1_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
+ aom_free(cpi->tplist[0][0]);
+
+ CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+ aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+ sizeof(*cpi->tplist[0][0])));
av1_setup_pc_tree(&cpi->common, &cpi->td);
}
@@ -1067,6 +1084,32 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
10; // Default value (not signaled)
}
+ if (cm->seq_params.monochrome) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
+ cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
+ cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.profile == 0) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.profile == 1) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.bit_depth == AOM_BITS_12) {
+ cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
+ cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+ } else {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 0;
+ }
+ }
+ }
+
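The branch above pins down the (subsampling_x, subsampling_y) pairs each configuration may use: monochrome and profile 0 force (1,1), i.e. 4:2:0; profile 1 forces (0,0), i.e. 4:4:4; profile 2 is (1,0), i.e. 4:2:2, except at 12-bit depth where the config chooses. A small lookup expressing the pair-to-format mapping (the helper is illustrative):

#include <stdio.h>

// Illustrative: name the chroma format implied by a subsampling pair.
static const char *chroma_format(int ss_x, int ss_y) {
  if (ss_x == 1 && ss_y == 1) return "4:2:0";
  if (ss_x == 1 && ss_y == 0) return "4:2:2";
  if (ss_x == 0 && ss_y == 0) return "4:4:4";
  return "invalid";
}

int main(void) {
  printf("profile 0 -> %s\n", chroma_format(1, 1));  // 4:2:0
  printf("profile 1 -> %s\n", chroma_format(0, 0));  // 4:4:4
  printf("profile 2 -> %s\n", chroma_format(1, 0));  // 4:2:2
  return 0;
}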
cm->width = oxcf->width;
cm->height = oxcf->height;
set_sb_size(&cm->seq_params,
@@ -2326,6 +2369,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
cpi->common.options = oxcf->cfg;
+ cpi->row_mt = oxcf->row_mt;
x->e_mbd.bd = (int)seq_params->bit_depth;
x->e_mbd.global_motion = cm->global_motion;
@@ -2350,6 +2394,22 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
CHECK_MEM_ERROR(cm, x->palette_buffer,
aom_memalign(16, sizeof(*x->palette_buffer)));
}
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+ }
+ }
+
av1_reset_segment_features(cm);
set_high_precision_mv(cpi, 1, 0);
@@ -2367,11 +2427,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
rc->worst_quality = cpi->oxcf.worst_allowed_q;
rc->best_quality = cpi->oxcf.best_allowed_q;
- if (!oxcf->large_scale_tile)
- cm->interp_filter = cpi->sf.default_interp_filter;
- else
- cm->interp_filter = EIGHTTAP_REGULAR;
-
+ cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
cm->switchable_motion_mode = 1;
if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
@@ -2588,6 +2644,15 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+
+ cpi->td.mb.g_crc_initialized = 0;
+
CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
@@ -2913,9 +2978,19 @@ void av1_remove_compressor(AV1_COMP *cpi) {
// Deallocate allocated thread data.
if (t < cpi->num_workers - 1) {
aom_free(thread_data->td->palette_buffer);
+ aom_free(thread_data->td->tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(thread_data->td->tmp_obmc_bufs[j]);
+ }
aom_free(thread_data->td->above_pred_buf);
aom_free(thread_data->td->left_pred_buf);
aom_free(thread_data->td->wsrc_buf);
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(thread_data->td->hash_value_buffer[x][y]);
+ thread_data->td->hash_value_buffer[x][y] = NULL;
+ }
+ }
aom_free(thread_data->td->mask_buf);
aom_free(thread_data->td->counts);
av1_free_pc_tree(thread_data->td, num_planes);
@@ -3058,53 +3133,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
}
#endif
-#if USE_GF16_MULTI_LAYER
-static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- AV1_COMMON *const cm = &cpi->common;
- const FRAME_UPDATE_TYPE next_frame_update_type =
- gf_group->update_type[gf_group->index];
-
- if (cm->show_existing_frame == 1) {
- cm->show_existing_frame = 0;
- } else if (cpi->rc.is_last_bipred_frame) {
- cpi->rc.is_last_bipred_frame = 0;
- cm->show_existing_frame = 1;
- cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[BWDREF_FRAME - 1];
- } else if (next_frame_update_type == OVERLAY_UPDATE ||
- next_frame_update_type == INTNL_OVERLAY_UPDATE) {
- // Check the temporal filtering status for the next OVERLAY frame
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- int which_arf = 0, arf_idx;
- // Identify the index to the next overlay frame.
- for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) {
- which_arf = arf_idx;
- break;
- }
- }
- assert(arf_idx < num_arfs_in_gf);
- if (cpi->is_arf_filter_off[which_arf]) {
- cm->show_existing_frame = 1;
- cpi->rc.is_src_frame_alt_ref = 1;
- cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
- ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
- : cpi->ref_fb_idx[BWDREF_FRAME - 1];
- cpi->is_arf_filter_off[which_arf] = 0;
- }
- }
- cpi->rc.is_src_frame_ext_arf = 0;
-}
-#endif // USE_GF16_MULTI_LAYER
-
static void check_show_existing_frame(AV1_COMP *cpi) {
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- check_show_existing_frame_gf16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
-
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
AV1_COMMON *const cm = &cpi->common;
const FRAME_UPDATE_TYPE next_frame_update_type =
@@ -3350,13 +3379,13 @@ static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
EXTREF_FRAME - 1 };
for (int i = 2; i > 0; --i) {
- cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
-
// [0] is allocated to the current coded frame, i.e. bwdref
memcpy(
cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
}
}
@@ -3370,52 +3399,16 @@ static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
EXTREF_FRAME - 1 };
for (int i = 0; i < 2; ++i) {
- cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
-
// [0] is allocated to the current coded frame, i.e. bwdref
memcpy(
cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
- }
-}
-#endif // USE_SYMM_MULTI_LAYER
-#if USE_GF16_MULTI_LAYER
-static void update_reference_frames_gf16(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- BufferPool *const pool = cm->buffer_pool;
-
- if (cm->frame_type == KEY_FRAME) {
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- ref_cnt_fb(pool->frame_bufs,
- &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
- cm->new_fb_idx);
- }
- } else {
- if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
- cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_alt_ref_frame) {
- assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
- ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx],
- cm->new_fb_idx);
- }
-
- // TODO(zoeliu): To handle cpi->interp_filter_selected[].
-
- // For GF of 16, an additional ref frame index mapping needs to be handled
- // if this is the last frame to encode in the current GF group.
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE)
- av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1);
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
}
-
-#if DUMP_REF_FRAME_IMAGES == 1
- // Dump out all reference frame images.
- dump_ref_frame_images(cpi);
-#endif // DUMP_REF_FRAME_IMAGES
}
-#endif // USE_GF16_MULTI_LAYER
+#endif // USE_SYMM_MULTI_LAYER
static void update_reference_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
@@ -3424,12 +3417,20 @@ static void update_reference_frames(AV1_COMP *cpi) {
// for the purpose to verify no mismatch between encoder and decoder.
if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- update_reference_frames_gf16(cpi);
- return;
+ // In the case of a show_existing frame, we will not send refresh flags
+ // to the decoder. Any change in the reference frame buffer can be done
+ // by switching the virtual indices.
+ if (cm->show_existing_frame) {
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
}
-#endif // USE_GF16_MULTI_LAYER
BufferPool *const pool = cm->buffer_pool;
@@ -3458,9 +3459,15 @@ static void update_reference_frames(AV1_COMP *cpi) {
// slot and, if we're updating the GF, the current frame becomes the new GF.
int tmp;
- ref_cnt_fb(pool->frame_bufs,
- &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
- cm->new_fb_idx);
+ // ARF is in general a better reference than the overlay frame. We should
+ // keep ARF as a reference instead of replacing it with the overlay.
+
+ if (!cpi->preserve_arf_as_gld) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
+ cm->new_fb_idx);
+ }
+
tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1];
cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
@@ -3758,7 +3765,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
av1_set_speed_features_framesize_independent(cpi);
av1_set_rd_speed_thresholds(cpi);
av1_set_rd_speed_thresholds_sub8x8(cpi);
- cpi->common.interp_filter = cpi->sf.default_interp_filter;
+ cpi->common.interp_filter = SWITCHABLE;
cpi->common.switchable_motion_mode = 1;
}
@@ -3818,7 +3825,8 @@ static void set_restoration_unit_size(int width, int height, int sx, int sy,
rst[2].restoration_unit_size = rst[1].restoration_unit_size;
}
-static void init_ref_frame_bufs(AV1_COMMON *cm) {
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
int i;
BufferPool *const pool = cm->buffer_pool;
cm->new_fb_idx = INVALID_IDX;
@@ -3828,7 +3836,7 @@ static void init_ref_frame_bufs(AV1_COMMON *cm) {
}
if (cm->seq_params.force_screen_content_tools) {
for (i = 0; i < FRAME_BUFFERS; ++i) {
- av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+ av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
}
}
}
@@ -3846,7 +3854,7 @@ static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
seq_params->use_highbitdepth = use_highbitdepth;
alloc_raw_frame_buffers(cpi);
- init_ref_frame_bufs(cm);
+ init_ref_frame_bufs(cpi);
alloc_util_frame_buffers(cpi);
init_motion_estimation(cpi); // TODO(agrange) This can be removed.
@@ -4220,7 +4228,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
if (lf->filter_level[0] || lf->filter_level[1]) {
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0);
#else
if (cpi->num_workers > 1)
av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
@@ -4587,8 +4595,8 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
rc->projected_frame_size < rc->max_frame_bandwidth)
loop = 0;
- if (recode_loop_test_global_motion(cpi)) {
- loop = 1;
+ if (!cpi->sf.gm_disable_recode) {
+ if (recode_loop_test_global_motion(cpi)) loop = 1;
}
if (loop) {
@@ -4716,47 +4724,6 @@ static void set_ext_overrides(AV1_COMP *cpi) {
cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
}
-static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
- InterpFilter ifilter;
- int ref_total[REF_FRAMES] = { 0 };
- MV_REFERENCE_FRAME ref;
- int mask = 0;
- int arf_idx = ALTREF_FRAME;
-
- if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
- cpi->refresh_alt2_ref_frame)
- return mask;
-
- for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
- for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
- ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
-
- for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
- if ((ref_total[LAST_FRAME] &&
- cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
- (ref_total[LAST2_FRAME] == 0 ||
- cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 <
- ref_total[LAST2_FRAME]) &&
- (ref_total[LAST3_FRAME] == 0 ||
- cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 <
- ref_total[LAST3_FRAME]) &&
- (ref_total[GOLDEN_FRAME] == 0 ||
- cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
- ref_total[GOLDEN_FRAME]) &&
- (ref_total[BWDREF_FRAME] == 0 ||
- cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
- ref_total[BWDREF_FRAME]) &&
- (ref_total[ALTREF2_FRAME] == 0 ||
- cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 <
- ref_total[ALTREF2_FRAME]) &&
- (ref_total[ALTREF_FRAME] == 0 ||
- cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
- ref_total[ALTREF_FRAME]))
- mask |= 1 << ifilter;
- }
- return mask;
-}
-
#define DUMP_RECON_FRAMES 0
#if DUMP_RECON_FRAMES == 1
@@ -4914,7 +4881,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (cm->current_video_frame > 0)
cpi->ref_frame_flags = get_ref_frame_flags(cpi);
- if (cm->show_existing_frame) {
+ if (encode_show_existing_frame(cm)) {
// NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
// BWDREF_FRAME in the reference frame buffer.
if (cm->frame_type == KEY_FRAME) {
@@ -4925,20 +4892,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->show_frame = 1;
cpi->frame_flags = *frame_flags;
- // In the case of show_existing frame, we will not send fresh flag
- // to decoder. Any change in the reference frame buffer can be done by
- // switching the virtual indices.
-
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_bwd_ref_frame = 0;
- cpi->rc.is_last_bipred_frame = 0;
- cpi->rc.is_bipred_frame = 0;
-
restore_coding_context(cpi);
// Build the bitstream
@@ -4990,10 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
av1_rc_postencode_update(cpi, *size);
}
- // Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
-
++cm->current_video_frame;
return AOM_CODEC_OK;
@@ -5002,9 +4951,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
- if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
- cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
-
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
// Reset the loop filter deltas and segmentation map.
@@ -5246,15 +5192,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
// takes a space in the gf group. Therefore, even when
// it is not shown, we still need to update the count down.
- // TODO(weitinglin): This is a work-around to handle the condition
- // when a frame is drop. We should fix the cm->show_frame flag
- // instead of checking the other condition to update the counter properly.
- if (cm->show_frame || is_frame_droppable(cpi)) {
- // Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
- }
-
if (cm->show_frame) {
// TODO(zoeliu): We may only swap mi and prev_mi for those frames that
// are
@@ -5279,6 +5216,50 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
return AOM_CODEC_OK;
}
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+ // differently here for rc->avg_frame_bandwidth.
+ if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+ // Skip the update for a show_existing_frame whose source is not an
+ // altref and which is not a forward keyframe: its counters were already
+ // incremented when the frame was originally encoded.
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ }
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+ // TODO(weitinglin): Updating this counter for is_frame_droppable
+ // is a work-around to handle the case when a frame is dropped.
+ // We should fix the cpi->common.show_frame flag instead of checking
+ // the extra condition so the counter is updated properly.
+ if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+ // Increment the gf group index ready for the next frame. Skip the
+ // increment for a show_existing_frame whose source is not an altref and
+ // which is not a forward keyframe: the index was already incremented
+ // when that frame was originally encoded.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+ ++cpi->twopass.gf_group.index;
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
int skip_adapt, unsigned int *frame_flags) {
if (cpi->oxcf.rc_mode == AOM_CBR) {
@@ -5290,6 +5271,7 @@ static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
AOM_CODEC_OK) {
return AOM_CODEC_ERROR;
}
+ update_rc_counts(cpi);
check_show_existing_frame(cpi);
return AOM_CODEC_OK;
}
@@ -5319,14 +5301,8 @@ static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->cum_txcoeff_cost_timer);
#endif
- // Do not do post-encoding update for those frames that do not have a spot
- // in
- // a gf group, but note that an OVERLAY frame always has a spot in a gf
- // group,
- // even when show_existing_frame is used.
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
- av1_twopass_postencode_update(cpi);
- }
+ av1_twopass_postencode_update(cpi);
+ update_rc_counts(cpi);
check_show_existing_frame(cpi);
return AOM_CODEC_OK;
}
@@ -5734,7 +5710,7 @@ static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
av1_get_block_hash_value(
cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
block_size, &hash_value_1, &hash_value_2,
- (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH));
+ (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
// Hashing does not work for highbitdepth currently.
// TODO(Roger): Make it work for highbitdepth.
if (av1_use_hash_me(&cpi->common)) {
@@ -5822,13 +5798,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
- // Is multi-arf enabled.
- // Note that at the moment multi_arf is only configured for 2 pass VBR
- if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
- cpi->multi_arf_allowed = 1;
- else
- cpi->multi_arf_allowed = 0;
-
// Normal defaults
cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
? REFRESH_FRAME_CONTEXT_DISABLED
@@ -5850,16 +5819,20 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
struct lookahead_entry *lookahead_src = NULL;
if (cm->current_video_frame > 0)
lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
- if (lookahead_src != NULL &&
- ((cpi->oxcf.error_resilient_mode |
- ((lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT) != 0)) ||
- (cpi->oxcf.s_frame_mode |
- ((lookahead_src->flags & AOM_EFLAG_SET_S_FRAME) != 0))) &&
- !(rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
- cm->show_existing_frame = 0;
+
+ int use_show_existing = 1;
+ if (lookahead_src != NULL) {
+ const int is_error_resilient =
+ cpi->oxcf.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.s_frame_mode ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
+ use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
}
- if (oxcf->pass == 2 && cm->show_existing_frame) {
+ if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
// Manage the source buffer and flush out the source frame that has been
// coded already; Also get prepared for PSNR calculation if needed.
if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
@@ -6415,3 +6388,50 @@ int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
+ if (!cpi) return NULL;
+
+ uint8_t header_buf[512] = { 0 };
+ const uint32_t sequence_header_size =
+ write_sequence_header_obu(cpi, &header_buf[0]);
+ assert(sequence_header_size <= sizeof(header_buf));
+ if (sequence_header_size == 0) return NULL;
+
+ const size_t obu_header_size = 1;
+ const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+ const size_t payload_offset = obu_header_size + size_field_size;
+
+ if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+ memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+ if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+ obu_header_size) {
+ return NULL;
+ }
+
+ size_t coded_size_field_size = 0;
+ if (aom_uleb_encode(sequence_header_size, size_field_size,
+ &header_buf[obu_header_size],
+ &coded_size_field_size) != 0) {
+ return NULL;
+ }
+ assert(coded_size_field_size == size_field_size);
+
+ aom_fixed_buf_t *global_headers =
+ (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+ if (!global_headers) return NULL;
+
+ const size_t global_header_buf_size =
+ obu_header_size + size_field_size + sequence_header_size;
+
+ global_headers->buf = malloc(global_header_buf_size);
+ if (!global_headers->buf) {
+ free(global_headers);
+ return NULL;
+ }
+
+ memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+ global_headers->sz = global_header_buf_size;
+ return global_headers;
+}
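// A minimal caller sketch (illustrative, not part of the patch): the returned
// buffer holds a 1-byte OBU header, a ULEB128 obu_size field, and then the
// sequence-header payload, and both allocations belong to the caller.
static void global_headers_usage_sketch(AV1_COMP *cpi) {
  aom_fixed_buf_t *hdr = av1_get_global_headers(cpi);
  if (hdr == NULL) return;
  // ... hand hdr->buf / hdr->sz to the container muxer here ...
  free(hdr->buf);  // release the OBU bytes first
  free(hdr);       // then the descriptor itself
}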
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 2b7ab711d..ee7fc4637 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ENCODER_H_
-#define AV1_ENCODER_ENCODER_H_
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
#include <stdio.h>
@@ -142,7 +142,6 @@ typedef struct AV1EncoderConfig {
int noise_sensitivity; // pre processing blur: recommendation 0
int sharpness; // sharpening output: recommendation 0:
int speed;
- int dev_sf;
// maximum allowed bitrate for any intra frame in % of bitrate target.
unsigned int rc_max_intra_bitrate_pct;
// maximum allowed bitrate for any inter frame in % of bitrate target.
@@ -249,6 +248,7 @@ typedef struct AV1EncoderConfig {
int min_gf_interval;
int max_gf_interval;
+ int row_mt;
int tile_columns;
int tile_rows;
int tile_width_count;
@@ -309,6 +309,9 @@ typedef struct AV1EncoderConfig {
float noise_level;
int noise_block_size;
#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
} AV1EncoderConfig;
static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -401,6 +404,43 @@ typedef struct FRAME_COUNTS {
[SWITCHABLE_FILTERS];
} FRAME_COUNTS;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out
+// what the maximum number of modes is.
+#define MAX_INTER_MODES 1024
+typedef struct inter_modes_info {
+ int num;
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ int mode_rate_arr[MAX_INTER_MODES];
+ int64_t sse_arr[MAX_INTER_MODES];
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+} InterModesInfo;
+#endif
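// The struct above is a structure-of-arrays; rd_idx_pair_arr appears intended
// to let candidates be re-ordered by estimated RD cost by sorting the small
// { idx, rd } pairs instead of moving the large MB_MODE_INFO entries.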
+
// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
typedef struct TileDataEnc {
TileInfo tile_info;
@@ -411,8 +451,18 @@ typedef struct TileDataEnc {
CFL_CTX cfl;
DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
uint8_t allow_update_cdf;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ InterModesInfo inter_modes_info;
+#endif
} TileDataEnc;
+typedef struct {
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+ unsigned int count;
+} TOKENLIST;
+
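// (TOKENLIST is presumably filled per superblock row when row_mt is enabled:
// start/stop bound the TOKENEXTRA range a row produced, and count is the
// number of tokens in that range.)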
typedef struct RD_COUNTS {
int64_t comp_pred_diff[REFERENCE_MODES];
// Stores number of 4x4 blocks using global motion per reference frame.
@@ -427,11 +477,14 @@ typedef struct ThreadData {
FRAME_COUNTS *counts;
PC_TREE *pc_tree;
PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+ uint32_t *hash_value_buffer[2][2];
int32_t *wsrc_buf;
int32_t *mask_buf;
uint8_t *above_pred_buf;
uint8_t *left_pred_buf;
PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
int intrabc_used_this_tile;
} ThreadData;
@@ -502,6 +555,7 @@ typedef struct AV1_COMP {
int previous_index;
int cur_poc; // DebugInfo
+ unsigned int row_mt;
int scaled_ref_idx[REF_FRAMES];
int ref_fb_idx[REF_FRAMES];
int refresh_fb_idx; // ref frame buffer index to refresh
@@ -647,13 +701,12 @@ typedef struct AV1_COMP {
search_site_config ss_cfg;
- int multi_arf_allowed;
-
TileDataEnc *tile_data;
int allocated_tiles; // Keep track of memory allocated for tiles.
TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+ TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
@@ -703,8 +756,13 @@ typedef struct AV1_COMP {
#if CONFIG_DENOISE
struct aom_denoise_and_model_t *denoise_and_model;
#endif
+ // Stores the default value of the skip flag depending on the chroma format:
+ // set to 1 for monochrome and 3 for other color formats.
+ int default_interp_skip_flags;
+ int preserve_arf_as_gld;
} AV1_COMP;
+// Must not be called more than once.
void av1_initialize_enc(void);
struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
@@ -833,6 +891,22 @@ static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
}
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+ int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+ *tok = cpi->tile_tok[tile_row][tile_col] +
+ get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
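// Numeric illustration (hypothetical values): for a tile spanning 34 mi
// columns, tile_mb_cols = (34 + 2) >> 2 = 9; at an mi_row 16 rows past the
// tile start, tile_mb_row = (16 + 2) >> 2 = 4, so *tok starts
// get_token_alloc(4, 9, sb_size_log2, num_planes) tokens into the tile's
// token buffer.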
+
void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
#define ALT_MIN_LAG 3
@@ -885,8 +959,27 @@ static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
}
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame &&
+ (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME);
+}
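// For example, a forward keyframe is still coded as show_existing even when
// error_resilient_mode is set, because frame_type == KEY_FRAME satisfies the
// exception above.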
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned, the caller
+// owns the memory: both the buf member of the aom_fixed_buf_t and the
+// aom_fixed_buf_t pointer itself must be released via free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODER_H_
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 81f360733..5a31d93d7 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -133,6 +133,38 @@ static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
return error;
}
+static const int8_t eob_to_pos_small[33] = {
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+ 6, // placeholder
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+ int t;
+
+ if (eob < 33) {
+ t = eob_to_pos_small[eob];
+ } else {
+ const int e = AOMMIN((eob - 1) >> 5, 16);
+ t = eob_to_pos_large[e];
+ }
+
+ *extra = eob - k_eob_group_start[t];
+
+ return t;
+}
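// A minimal self-check sketch (illustrative only; k_eob_group_start is
// assumed to begin { 0, 1, 2, 3, 5, 9, 17, 33, ... } as used elsewhere in
// this file): eob = 50 falls in the 33-64 group, so e = (50 - 1) >> 5 = 1,
// t = eob_to_pos_large[1] = 7 and extra = 50 - 33 = 17.
static INLINE void eob_pos_token_sketch(void) {
  int extra;
  const int t = get_eob_pos_token(50, &extra);
  assert(t == 7 && extra == 17);
  (void)t;
  (void)extra;
}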
+
#if CONFIG_ENTROPY_STATS
void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
TX_CLASS tx_class, PLANE_TYPE plane,
@@ -464,8 +496,12 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
aom_writer *w, int blk_row, int blk_col, int plane,
TX_SIZE tx_size, const tran_low_t *tcoeff,
uint16_t eob, TXB_CTX *txb_ctx) {
- const PLANE_TYPE plane_type = get_plane_type(plane);
const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0,
+ ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+ if (eob == 0) return;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
const TX_CLASS tx_class = tx_type_to_class[tx_type];
@@ -475,18 +511,10 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
const int bwl = get_txb_bwl(tx_size);
const int width = get_txb_wide(tx_size);
const int height = get_txb_high(tx_size);
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
uint8_t levels_buf[TX_PAD_2D];
uint8_t *const levels = set_levels(levels_buf, width);
DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
-
- aom_write_symbol(w, eob == 0,
- ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
- if (plane == 0 && eob == 0) {
- assert(tx_type == DCT_DCT);
- }
- if (eob == 0) return;
-
av1_txb_init_levels(tcoeff, width, height, levels);
av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index 0442cc613..40ae343b0 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef ENCODETXB_H_
-#define ENCODETXB_H_
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
#include "config/aom_config.h"
@@ -84,4 +84,4 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
}
#endif
-#endif // COEFFS_CODING_H_
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 637d6824c..e8ac30bb5 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -27,7 +27,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
}
-static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
const AV1_COMMON *const cm = &cpi->common;
const int tile_cols = cm->tile_cols;
@@ -47,88 +48,141 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
return 1;
}
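// Taking void * here lets the hook be assigned without a function-pointer
// cast; AVxWorkerHook is assumed to be declared along the lines of
//   typedef int (*AVxWorkerHook)(void *arg1, void *arg2);
// so prepare_enc_workers() below can set worker->hook = enc_worker_hook
// directly.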
-void av1_encode_tiles_mt(AV1_COMP *cpi) {
+static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = cm->tile_cols;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
- int i;
- av1_init_tile_data(cpi);
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
- // Only run once to create threads and allocate thread data.
- if (cpi->num_workers == 0) {
- CHECK_MEM_ERROR(cm, cpi->workers,
- aom_malloc(num_workers * sizeof(*cpi->workers)));
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
- CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
- aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->above_pred_buf)));
+ CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->left_pred_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->palette_buffer,
+ aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->td->tmp_conv_dst)));
+ for (int j = 0; j < 2; ++j) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_obmc_bufs[j],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+ }
- ++cpi->num_workers;
- winterface->init(worker);
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
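// Note on the sizes above: tmp_conv_dst holds one MAX_SB_SIZE x MAX_SB_SIZE
// block of CONV_BUF_TYPE convolution intermediates, and each tmp_obmc_buf
// holds two full-plane superblocks' worth of pixels (2 * MAX_MB_PLANE *
// MAX_SB_SQUARE bytes); prepare_enc_workers() wires both into each worker's
// MACROBLOCK.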
- thread_data->cpi = cpi;
+static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ // Encode a frame
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
- if (i < num_workers - 1) {
- // Allocate thread data.
- CHECK_MEM_ERROR(cm, thread_data->td,
- aom_memalign(32, sizeof(*thread_data->td)));
- av1_zero(*thread_data->td);
+ // Set the starting tile for each thread.
+ thread_data->start = i;
- // Set up pc_tree.
- thread_data->td->pc_tree = NULL;
- av1_setup_pc_tree(cm, thread_data->td);
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
- CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
- (uint8_t *)aom_memalign(
- 16, MAX_MB_PLANE * MAX_SB_SQUARE *
- sizeof(*thread_data->td->above_pred_buf)));
- CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
- (uint8_t *)aom_memalign(
- 16, MAX_MB_PLANE * MAX_SB_SQUARE *
- sizeof(*thread_data->td->left_pred_buf)));
+static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- CHECK_MEM_ERROR(
- cm, thread_data->td->wsrc_buf,
- (int32_t *)aom_memalign(
- 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
- CHECK_MEM_ERROR(
- cm, thread_data->td->mask_buf,
- (int32_t *)aom_memalign(
- 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
- // Allocate frame counters in thread data.
- CHECK_MEM_ERROR(cm, thread_data->td->counts,
- aom_calloc(1, sizeof(*thread_data->td->counts)));
-
- // Allocate buffers used by palette coding mode.
- CHECK_MEM_ERROR(
- cm, thread_data->td->palette_buffer,
- aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-
- // Create threads
- if (!winterface->reset(worker))
- aom_internal_error(&cm->error, AOM_CODEC_ERROR,
- "Tile encoder thread creation failed");
- } else {
- // Main thread acts as a worker and uses the thread data in cpi.
- thread_data->td = &cpi->td;
- }
+ // Encoding ends.
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+}
- winterface->sync(worker);
+static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
}
- } else {
- num_workers = AOMMIN(num_workers, cpi->num_workers);
}
+}
- for (i = 0; i < num_workers; i++) {
+static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
- worker->hook = (AVxWorkerHook)enc_worker_hook;
+ worker->hook = hook;
worker->data1 = thread_data;
worker->data2 = NULL;
@@ -139,47 +193,59 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
}
if (thread_data->td->counts != &cpi->counts) {
memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
}
- if (i < num_workers - 1)
+ if (i < num_workers - 1) {
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
- }
-
- // Encode a frame
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
- // Set the starting tile for each thread.
- thread_data->start = i;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_obmc_bufs[j] =
+ thread_data->td->tmp_obmc_bufs[j];
+ }
- if (i == cpi->num_workers - 1)
- winterface->execute(worker);
- else
- winterface->launch(worker);
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_obmc_bufs[j];
+ }
+ }
}
+}
- // Encoding ends.
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- winterface->sync(worker);
- }
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
- for (i = 0; i < num_workers; i++) {
- AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
- cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
- // Accumulate counters.
- if (i < cpi->num_workers - 1) {
- av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
- accumulate_rd_opt(&cpi->td, thread_data->td);
- cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
- }
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ create_enc_workers(cpi, num_workers);
+ } else {
+ num_workers = AOMMIN(num_workers, cpi->num_workers);
}
+ prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+ launch_enc_workers(cpi, num_workers);
+ sync_enc_workers(cpi, num_workers);
+ accumulate_counters_enc_workers(cpi, num_workers);
}
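// The create/prepare/launch/sync/accumulate split keeps thread creation a
// one-time cost: create_enc_workers() runs only while cpi->num_workers == 0,
// and on every frame the last worker is execute()d on the calling thread so
// the main thread encodes tiles instead of idling.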
// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
index b6b1fed4e..5de4b4803 100644
--- a/third_party/aom/av1/encoder/ethread.h
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ETHREAD_H_
-#define AV1_ENCODER_ETHREAD_H_
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
#ifdef __cplusplus
extern "C" {
@@ -34,4 +34,4 @@ void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
} // extern "C"
#endif
-#endif // AV1_ENCODER_ETHREAD_H_
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
index 48178b964..e0432cc97 100644
--- a/third_party/aom/av1/encoder/extend.h
+++ b/third_party/aom/av1/encoder/extend.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_EXTEND_H_
-#define AV1_ENCODER_EXTEND_H_
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
#include "aom_scale/yv12config.h"
#include "aom/aom_integer.h"
@@ -29,4 +29,4 @@ void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
} // extern "C"
#endif
-#endif // AV1_ENCODER_EXTEND_H_
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index ef0800c79..69dd20c52 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -31,6 +31,7 @@
#include "av1/encoder/aq_variance.h"
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
@@ -39,7 +40,7 @@
#include "av1/encoder/firstpass.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/rd.h"
-#include "av1/encoder/dwt.h"
+#include "av1/encoder/reconinter_enc.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@@ -51,9 +52,10 @@
#define FACTOR_PT_LOW 0.70
#define FACTOR_PT_HIGH 0.90
#define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
+#define GF_MAX_BOOST 90.0
#define INTRA_MODE_PENALTY 1024
-#define KF_MAX_BOOST 128.0
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
#define MIN_KF_BOOST 300
@@ -62,6 +64,7 @@
#define DEFAULT_GRP_WEIGHT 1.0
#define RC_FACTOR_MIN 0.75
#define RC_FACTOR_MAX 1.75
+#define MIN_FWD_KF_INTERVAL 8
#define NCOUNT_INTRA_THRESH 8192
#define NCOUNT_INTRA_FACTOR 3
@@ -1562,576 +1565,9 @@ static int calculate_boost_bits(int frame_count, int boost,
0);
}
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-#define GF_INTERVAL_16 16
-#define GF_FRAME_PARAMS (REF_FRAMES + 5)
-
-// GF Group of 16: multi-layer hierarchical coding structure
-// 1st Layer: Frame 0 and Frame 16 (ALTREF)
-// 2nd Layer: Frame 8 (ALTREF2)
-// 3rd Layer: Frame 4 and 12 (ALTREF2)
-// 4th Layer: Frame 2, 6, 10, and 14 (BWDREF)
-// 5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15
-static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = {
- // gf_group->index: coding order
- // (Frame #) : display order
- {
- // gf_group->index == 0 (Frame 0)
- OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // References (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- GOLDEN_FRAME // cpi->refresh_golden_frame = 1
- },
- {
- // gf_group->index == 1 (Frame 16)
- ARF_UPDATE, // update_type
- GF_INTERVAL_16 - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- GOLDEN_FRAME, // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- ALTREF_FRAME // cpi->refresh_alt_ref_frame = 1
- },
- {
- // gf_group->index == 2 (Frame 8)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 1) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 3 (Frame 4)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
- // (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
- // (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 4 (Frame 2)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
- // (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
- // (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- REF_FRAMES, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_ref_frame = 1
- },
- {
- // gf_group->index == 5 (Frame 1)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 6 (Frame 3)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 7 (Frame 4 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 8 (Frame 6)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx -> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 9 (Frame 5)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 10 (Frame 7)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 11 (Frame 8 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 12 (Frame 12)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_16 >> 2) - 1, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 13 (Frame 10)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF2_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 14 (Frame 9)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 15 (Frame 11)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 16 (Frame 12 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- ALTREF2_FRAME // cpi->refresh_alt2_ref_frame = 1
- },
- {
- // gf_group->index == 17 (Frame 14)
- BRF_UPDATE, // update_type
- 0, // arf_src_offset
- 1, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- BWDREF_FRAME, // Index (current) of reference to get updated
- BWDREF_FRAME // cpi->refresh_bwd_frame = 1
- },
- {
- // gf_group->index == 18 (Frame 13)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 19 (Frame 15)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- BWDREF_FRAME, // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
- // LAST_FRAME]
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- LAST3_FRAME, // Index (current) of reference to get updated
- LAST_FRAME // cpi->refresh_last_frame = 1
- },
- {
- // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF
- // group)
- OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // brf_src_offset
- // Reference frame indexes (previous ===> current)
- LAST3_FRAME, // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
- LAST_FRAME, // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
- LAST2_FRAME, // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
- // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
- GOLDEN_FRAME, // cpi->gld_fb_idx (GOLDEN_FRAME)
- BWDREF_FRAME, // cpi->bwd_fb_idx (BWDREF_FRAME)
- ALTREF2_FRAME, // cpi->alt2_fb_idx (ALTREF2_FRAME)
- ALTREF_FRAME, // cpi->alt_fb_idx (ALTREF_FRAME)
- REF_FRAMES, // cpi->ext_fb_idx (extra ref frame)
- // Refreshment (index, flag)
- ALTREF_FRAME, // Index (current) of reference to get updated
- GOLDEN_FRAME // cpi->refresh_golden_frame = 1
- }
-};
-
-// === GF Group of 16 ===
-static void define_gf_group_structure_16(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- const int key_frame = cpi->common.frame_type == KEY_FRAME;
-
- assert(rc->baseline_gf_interval == GF_INTERVAL_16);
-
- // Total number of frames to consider for GF group of 16:
- // = GF group interval + number of OVERLAY's
- // = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1
- // NOTE: The OVERLAY frame for the next GF group also needs to consider to
- // prepare for the reference frame index mapping.
-
- const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2;
-
- for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
- int param_idx = 0;
-
- // Treat KEY_FRAME differently
- if (frame_index == 0 && key_frame) {
- gf_group->update_type[frame_index] = KF_UPDATE;
-
- gf_group->rf_level[frame_index] = KF_STD;
- gf_group->arf_src_offset[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- gf_group->bidir_pred_enabled[frame_index] = 0;
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
- gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx;
- gf_group->refresh_idx[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
- gf_group->refresh_flag[frame_index] = cpi->ref_fb_idx[LAST_FRAME - 1];
-
- continue;
- }
-
- // == update_type ==
- gf_group->update_type[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == rf_level ==
- // Derive rf_level from update_type
- switch (gf_group->update_type[frame_index]) {
- case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
- case OVERLAY_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
- case LAST_BIPRED_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- case INTNL_ARF_UPDATE:
- gf_group->rf_level[frame_index] = GF_ARF_LOW;
- break;
- case INTNL_OVERLAY_UPDATE:
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- break;
- default: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
- }
-
- // == arf_src_offset ==
- gf_group->arf_src_offset[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == brf_src_offset ==
- gf_group->brf_src_offset[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == bidir_pred_enabled ==
- // Derive bidir_pred_enabled from bidir_src_offset
- gf_group->bidir_pred_enabled[frame_index] =
- gf_group->brf_src_offset[frame_index] ? 1 : 0;
-
- // == ref_fb_idx_map ==
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
- gf_group->ref_fb_idx_map[frame_index][ref_idx] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == refresh_idx ==
- gf_group->refresh_idx[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx++];
-
- // == refresh_flag ==
- gf_group->refresh_flag[frame_index] =
- gf16_multi_layer_params[frame_index][param_idx];
- }
-
- // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE /
- // INTNL_OVERLAY_UPDATE for rate allocation
- // NOTE: Indexes are designed in the display order backward:
- // ALT[3] .. ALT[2] .. ALT[1] .. ALT[0],
- // but their coding order is as follows:
- // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0
-
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf;
-
- // == arf_pos_for_ovrly ==: Position for OVERLAY
- for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- const int prior_num_arfs =
- (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1);
- cpi->arf_pos_for_ovrly[arf_idx] =
- sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs;
- }
-
- // == arf_pos_in_gf ==: Position for ALTREF
- cpi->arf_pos_in_gf[0] = 1;
- cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1;
- cpi->arf_pos_in_gf[2] = 2;
- cpi->arf_pos_in_gf[3] = 3;
-
- // == arf_update_idx ==
- // == arf_ref_idx ==
- // NOTE: Due to the hierarchical nature of GF16, these two parameters only
- // relect the index to the nearest future overlay.
- int start_frame_index = 0;
- for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) {
- const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx];
- for (int frame_index = start_frame_index; frame_index <= end_frame_index;
- ++frame_index) {
- gf_group->arf_update_idx[frame_index] = arf_idx;
- gf_group->arf_ref_idx[frame_index] = arf_idx;
- }
- start_frame_index = end_frame_index + 1;
- }
-}
-#endif // USE_GF16_MULTI_LAYER
-
#if USE_SYMM_MULTI_LAYER
+// #define CHECK_GF_PARAMETER
+#ifdef CHECK_GF_PARAMETER
void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
int frame_nums) {
static const char *update_type_strings[] = {
@@ -2149,9 +1585,15 @@ void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
}
+
+ fprintf(fid, "number of nodes in each level: \n");
+ for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
+ fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+ }
+ fprintf(fid, "\n");
fclose(fid);
}
-
+#endif // CHECK_GF_PARAMETER
static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
// Derive rf_level from update_type
switch (update_type) {
@@ -2169,14 +1611,17 @@ static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
int *frame_ind, int arf_ind, int level) {
- if (r - l == 2) {
- // leaf node, not a look-ahead frame
- gf_group->update_type[*frame_ind] = LF_UPDATE;
- gf_group->arf_src_offset[*frame_ind] = 0;
- gf_group->arf_pos_in_gf[*frame_ind] = 0;
- gf_group->arf_update_idx[*frame_ind] = arf_ind;
- gf_group->pyramid_level[*frame_ind] = level;
- ++(*frame_ind);
+ if (r - l < 4) {
+ while (++l < r) {
+ // leaf nodes, not a look-ahead frame
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = arf_ind;
+ gf_group->pyramid_level[*frame_ind] = 0;
+ ++gf_group->pyramid_lvl_nodes[0];
+ ++(*frame_ind);
+ }
} else {
int m = (l + r) / 2;
int arf_pos_in_gf = *frame_ind;
@@ -2186,6 +1631,7 @@ static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
gf_group->arf_pos_in_gf[*frame_ind] = 0;
gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1
gf_group->pyramid_level[*frame_ind] = level;
+ ++gf_group->pyramid_lvl_nodes[level];
++(*frame_ind);
// set parameters for frames displayed before this frame
@@ -2209,7 +1655,7 @@ static INLINE unsigned char get_pyramid_height(int pyramid_width) {
assert(pyramid_width <= 16 && pyramid_width >= 4 &&
"invalid gf interval for pyramid structure");
- return pyramid_width == 16 ? 4 : (pyramid_width >= 8 ? 3 : 2);
+ return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
}
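// Mapping over the asserted range [4, 16]:
//   4..6   -> height 2
//   7..12  -> height 3
//   13..16 -> height 4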
static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
@@ -2217,6 +1663,10 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
int frame_index = 0;
gf_group->pyramid_height = get_pyramid_height(gf_interval);
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
+
+ av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+
// At the beginning of each GF group it will be a key or overlay frame,
gf_group->update_type[frame_index] = OVERLAY_UPDATE;
gf_group->arf_src_offset[frame_index] = 0;
@@ -2236,9 +1686,6 @@ static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
// set parameters for the rest of the frames
set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
gf_group->pyramid_height - 1);
-
- // check_frame_params(gf_group, gf_interval, frame_index);
-
return frame_index;
}
@@ -2248,8 +1695,8 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
GF_GROUP *const gf_group = &twopass->gf_group;
const int key_frame = cpi->common.frame_type == KEY_FRAME;
- assert(rc->baseline_gf_interval == 4 || rc->baseline_gf_interval == 8 ||
- rc->baseline_gf_interval == 16);
+ assert(rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
const int gf_update_frames =
construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
@@ -2305,8 +1752,9 @@ void define_customized_gf_group_structure(AV1_COMP *cpi) {
// This parameter is useless?
gf_group->arf_ref_idx[frame_index] = 0;
-
+#ifdef CHECK_GF_PARAMETER
check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+#endif
}
// It is an example of how to define a GF stucture manually. The function will
@@ -2447,16 +1895,10 @@ static int define_gf_group_structure_4(AV1_COMP *cpi) {
static void define_gf_group_structure(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
-#if USE_GF16_MULTI_LAYER
- if (rc->baseline_gf_interval == 16) {
- define_gf_group_structure_16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
#if USE_SYMM_MULTI_LAYER
- const int valid_customized_gf_length = rc->baseline_gf_interval == 4 ||
- rc->baseline_gf_interval == 8 ||
- rc->baseline_gf_interval == 16;
+ const int valid_customized_gf_length =
+ rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
// used the new structure only if extra_arf is allowed
if (valid_customized_gf_length && rc->source_alt_ref_pending &&
cpi->extra_arf_allowed > 0) {
@@ -2685,6 +2127,18 @@ static void define_gf_group_structure(AV1_COMP *cpi) {
gf_group->brf_src_offset[frame_index] = 0;
}
+#if USE_SYMM_MULTI_LAYER
+#define LEAF_REDUCTION_FACTOR 0.75f
+#define LVL_3_BOOST_FACTOR 0.8f
+#define LVL_2_BOOST_FACTOR 0.3f
+
+static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+ { 1, 0, 0 },
+ { LVL_3_BOOST_FACTOR, 0, 0 }, // Leaking budget works better
+ { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR,
+ (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) }
+};
+#endif // USE_SYMM_MULTI_LAYER
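// Worked example (assumed values): with pyramid_height = 4, an internal ARF
// at level 2 gives pyr_h = 2 and dist2top = 1, so lvl_budget_factor[2][1] =
// (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR = 0.2 * 0.3 = 0.06. In
// allocate_gf_group_bits() below, its boost is target_frame_size times 6% of
// the leaf budget (LEAF_REDUCTION_FACTOR * pyramid_lvl_nodes[0]), divided by
// the pyramid_lvl_nodes[2] frames sharing that level.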
static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
double group_error, int gf_arf_bits) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -2771,20 +2225,39 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
// BIPRED_UPDATE frames need to be further adjusted.
gf_group->bit_allocation[frame_index] = target_frame_size;
#if USE_SYMM_MULTI_LAYER
- } else if (cpi->new_bwdref_update_rule == 1 &&
+ } else if (cpi->new_bwdref_update_rule &&
gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ gf_group->pyramid_height >= 0 &&
+ "non-valid height for a pyramid structure");
+
int arf_pos = gf_group->arf_pos_in_gf[frame_index];
gf_group->bit_allocation[frame_index] = 0;
- // Tried boosting up the allocated bits on backward reference frame
- // by (target_frame_size >> 2) as in the original setting. However it
- // does not bring gains for pyramid structure with GF length = 16.
gf_group->bit_allocation[arf_pos] = target_frame_size;
-#endif
+#if MULTI_LVL_BOOST_VBR_CQ
+ const int pyr_h = gf_group->pyramid_height - 2;
+ const int this_lvl = gf_group->pyramid_level[arf_pos];
+ const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+
+ const float_t budget =
+ LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0];
+ const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] /
+ gf_group->pyramid_lvl_nodes[this_lvl];
+
+ gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost);
+#endif // MULTI_LVL_BOOST_VBR_CQ
+#endif // USE_SYMM_MULTI_LAYER
} else {
assert(gf_group->update_type[frame_index] == LF_UPDATE ||
gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
gf_group->bit_allocation[frame_index] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ if (cpi->new_bwdref_update_rule) {
+ gf_group->bit_allocation[frame_index] -=
+ (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+ }
+#endif // MULTI_LVL_BOOST_VBR_CQ
}
++frame_index;
@@ -2833,9 +2306,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i;
double boost_score = 0.0;
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
double old_boost_score = 0.0;
double mv_ratio_accumulator_thresh;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
#endif
double gf_group_err = 0.0;
#if GROUP_ADAPTIVE_MAXQ
@@ -2862,8 +2337,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int f_boost = 0;
int b_boost = 0;
int flash_detected;
- int active_max_gf_interval;
- int active_min_gf_interval;
int64_t gf_group_bits;
double gf_group_error_left;
int gf_arf_bits;
@@ -2898,11 +2371,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_group_skip_pct -= this_frame->intra_skip_pct;
gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
}
-#if !FIX_GF_INTERVAL_LENGTH
+#if !CONFIG_FIX_GF_LENGTH
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
(cpi->initial_height + cpi->initial_width) / 4.0;
-#endif
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
{
@@ -2915,23 +2387,19 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
- if (cpi->multi_arf_allowed) {
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
active_max_gf_interval = rc->max_gf_interval;
- } else {
- // The value chosen depends on the active Q range. At low Q we have
- // bits to spare and are better with a smaller interval and smaller boost.
- // At high Q when there are few bits to spare we are better with a longer
- // interval to spread the cost of the GF.
- active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
- // We have: active_min_gf_interval <= rc->max_gf_interval
- if (active_max_gf_interval < active_min_gf_interval)
- active_max_gf_interval = active_min_gf_interval;
- else if (active_max_gf_interval > rc->max_gf_interval)
- active_max_gf_interval = rc->max_gf_interval;
- }
}
-
+#endif // !CONFIG_FIX_GF_LENGTH
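
The interval cap above is worth seeing on its own: int_lbq grows with the active Q, so low-Q content gets a shorter group and high-Q content a longer one. A standalone sketch (AOMMIN as in aom_dsp_common.h):

#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))

/* Low int_lbq (low Q, spare bits) -> shorter group; high -> longer. */
static int active_max_interval(int int_lbq) {
  return 12 + AOMMIN(4, int_lbq / 6);
}
/* e.g. int_lbq = 12 gives 14 frames; int_lbq >= 24 saturates at 16. */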
double avg_sr_coded_error = 0;
double avg_raw_err_stdev = 0;
int non_zero_stdev_count = 0;
@@ -2990,10 +2458,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
boost_score +=
decay_accumulator *
calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
if (i == (FIXED_GF_LENGTH + 1)) break;
#else
- // Skip breaking condition for FIX_GF_INTERVAL_LENGTH
+ // Skip breaking condition for CONFIG_FIX_GF_LENGTH
// Break out conditions.
if (
// Break at active_max_gf_interval unless almost totally static.
@@ -3017,7 +2485,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
}
old_boost_score = boost_score;
-#endif // FIX_GF_INTERVAL_LENGTH
+#endif // CONFIG_FIX_GF_LENGTH
*this_frame = next_frame;
}
twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
@@ -3030,44 +2498,116 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
assert(num_mbs > 0);
if (i) avg_sr_coded_error /= i;
+ if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+ // Disable extra altrefs and backward refs for "still" gf group:
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int disable_bwd_extarf =
+ (zero_motion_accumulator > MIN_ZERO_MOTION &&
+ avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+ avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+
+ if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
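The stillness gate above combines three statistics into one predicate. A self-contained sketch; the three threshold values here are placeholders, not the MIN_ZERO_MOTION / MAX_SR_CODED_ERROR / MAX_RAW_ERR_VAR macros defined elsewhere in this file:

/* Returns 1 when the group looks static enough to drop extra ARFs. */
static int is_still_gf_group(double zero_motion_pct,
                             double avg_sr_err_per_mb,
                             double avg_raw_err_stdev) {
  const double min_zero_motion = 0.95;  /* placeholder threshold */
  const double max_sr_coded_error = 40; /* placeholder threshold */
  const double max_raw_err_var = 2000;  /* placeholder threshold */
  return zero_motion_pct > min_zero_motion &&
         avg_sr_err_per_mb < max_sr_coded_error &&
         avg_raw_err_stdev < max_raw_err_var;
}
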
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+ int alt_offset = 0;
+#if REDUCE_LAST_GF_LENGTH
+  // TODO(weitinglin): The length reduction strategy was tuned using AOM_Q
+  // mode and hurts the performance of VBR mode. We need to investigate how
+  // to adjust the GF length for other rate-control modes.
+
+ int allow_gf_length_reduction =
+ cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0;
+
+  // We are going to have an alt ref, but we skip this adjustment for
+  // lossless mode.
+ if (allow_alt_ref && allow_gf_length_reduction &&
+ (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) &&
+ !is_lossless_requested(&cpi->oxcf)) {
+    // Adjust the length of this gf group if one of the following
+    // conditions is met:
+    // 1: only one overlay frame is left and this gf group is too long
+    // 2: the next gf group would be too short to carry an ARF relative to
+    //    the current one
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf group is probably going to have an ARF but it will be
+    // shorter than this one
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+      // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
+      // better in the current setting
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ alt_offset = -roll_back;
+ i -= roll_back;
+ }
+ }
+#endif
+
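Stripped of the surrounding rate-control state, the roll-back test above reduces to a predicate over the group position. A sketch using the same threshold macros:

#define REDUCE_GF_LENGTH_THRESH 4
#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9

static int should_roll_back(int i, int frames_to_key, int min_gf_interval) {
  const int next_gf_len = frames_to_key - i;
  /* condition 1: only the overlay remains and this group is long */
  const int single_overlay_left =
      next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
  /* condition 2: the next group would be short but still ARF-capable */
  const int unbalanced_gf = i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
                            next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
                            next_gf_len + 1 >= min_gf_interval;
  return single_overlay_left || unbalanced_gf;
}
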
// Should we use the alternate reference frame.
if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
(i >= rc->min_gf_interval)) {
// Calculate the boost for alt ref.
rc->gfu_boost =
- calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
rc->source_alt_ref_pending = 1;
+
+  // Do not replace ARFs with overlay frames; keep them as GOLDEN_REF.
+ cpi->preserve_arf_as_gld = 1;
} else {
rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
rc->source_alt_ref_pending = 0;
+ cpi->preserve_arf_as_gld = 0;
}
// Set the interval until the next gf.
- if (cpi->oxcf.fwd_kf_enabled) {
- // Ensure the gf group before the next keyframe will contain an altref
- if ((rc->frames_to_key - i < rc->min_gf_interval) &&
- (rc->frames_to_key != i)) {
- rc->baseline_gf_interval = AOMMIN(rc->frames_to_key - rc->min_gf_interval,
- rc->static_scene_max_gf_interval);
- } else {
+ // If forward keyframes are enabled, ensure the final gf group obeys the
+ // MIN_FWD_KF_INTERVAL.
+ if (cpi->oxcf.fwd_kf_enabled &&
+ ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+ if (i == rc->frames_to_key) {
rc->baseline_gf_interval = i;
+ // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+ } else if ((rc->frames_to_key - i <
+ AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+ (rc->frames_to_key != i)) {
+ // if possible, merge the last two gf groups
+ if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
+ rc->baseline_gf_interval = rc->frames_to_key;
+        // if merging the last two gf groups creates a group that is too
+        // long, split them and make the last group MIN_FWD_KF_INTERVAL long
+ } else {
+ rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+ }
+ } else {
+ rc->baseline_gf_interval =
+ i - (is_key_frame || rc->source_alt_ref_pending);
}
} else {
rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
}
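
The merge-or-split decision above, compressed into one helper for readability (a simplification: the key/arf adjustment of the final else branch is left to the caller, and the constants keep their names from this file):

static int final_gf_interval(int i, int frames_to_key, int min_fwd_kf_interval,
                             int min_gf_interval, int max_pyramid_size) {
  if (i == frames_to_key) return i; /* group ends exactly at the keyframe */
  const int last_len = frames_to_key - i;
  const int min_last = min_fwd_kf_interval > min_gf_interval
                           ? min_fwd_kf_interval
                           : min_gf_interval;
  if (last_len < min_last) {
    /* merge if the merged group still fits one pyramid, otherwise split */
    return frames_to_key <= max_pyramid_size
               ? frames_to_key
               : frames_to_key - min_fwd_kf_interval;
  }
  return i; /* caller applies the usual key/arf adjustment */
}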
- if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
- // Disable extra altrefs and backward refs for "still" gf group:
- // zero_motion_accumulator: minimum percentage of (0,0) motion;
- // avg_sr_coded_error: average of the SSE per pixel of each frame;
- // avg_raw_err_stdev: average of the standard deviation of (0,0)
- // motion error per block of each frame.
- const int disable_bwd_extarf =
- (zero_motion_accumulator > MIN_ZERO_MOTION &&
- avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
- avg_raw_err_stdev < MAX_RAW_ERR_VAR);
-
- if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+#if REDUCE_LAST_ALT_BOOST
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ rc->arf_boost_factor = 1.0;
+ if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - i == 0) {
+ rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+#endif
if (!cpi->extra_arf_allowed) {
cpi->num_extra_arfs = 0;
@@ -3439,6 +2979,11 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// how many bits to spend on it.
decay_accumulator = 1.0;
boost_score = 0.0;
+ const double kf_max_boost =
+ cpi->oxcf.rc_mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
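The AOM_Q cap above scales with the keyframe distance, then clamps. The same computation as a helper, with the two boost limits passed in rather than restating the file's macro values:

static double kf_boost_cap(int frames_to_key, int is_aom_q,
                           double kf_min_frame_boost,
                           double kf_max_frame_boost) {
  if (!is_aom_q) return kf_max_frame_boost;
  double b = frames_to_key * 2.0;
  if (b < kf_min_frame_boost) b = kf_min_frame_boost;
  if (b > kf_max_frame_boost) b = kf_max_frame_boost;
  return b;
}
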
for (i = 0; i < (rc->frames_to_key - 1); ++i) {
if (EOF == input_stats(twopass, &next_frame)) break;
@@ -3450,7 +2995,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if ((i <= rc->max_gf_interval) ||
((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
const double frame_boost =
- calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+ calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
@@ -3513,147 +3058,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->modified_error_left -= kf_group_err;
}
-#if USE_GF16_MULTI_LAYER
-// === GF Group of 16 ===
-void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
-
- int ref_fb_idx_prev[REF_FRAMES];
- int ref_fb_idx_curr[REF_FRAMES];
-
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- ref_fb_idx_prev[ref_frame] = cpi->ref_fb_idx[ref_frame];
- }
-
- // Update map index for each reference frame
- for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
- int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx];
- ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME];
- }
-
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- cpi->ref_fb_idx[ref_frame] = ref_fb_idx_curr[ref_frame];
- }
-}
-
-// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates_16(AV1_COMP *cpi) {
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
-
- if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
- cpi->refresh_fb_idx = 0;
-
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_alt_ref_frame = 1;
-
- return;
- }
-
- // Update reference frame map indexes
- av1_ref_frame_map_idx_updates(cpi, gf_group->index);
-
- // Update refresh index
- switch (gf_group->refresh_idx[gf_group->index]) {
- case LAST_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST_FRAME - LAST_FRAME];
- break;
-
- case LAST2_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST2_FRAME - LAST_FRAME];
- break;
-
- case LAST3_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[LAST3_FRAME - LAST_FRAME];
- break;
-
- case GOLDEN_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
- break;
-
- case BWDREF_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[BWDREF_FRAME - 1];
- break;
-
- case ALTREF2_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF2_FRAME - 1];
- break;
-
- case ALTREF_FRAME:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
- break;
-
- case REF_FRAMES:
- cpi->refresh_fb_idx = cpi->ref_fb_idx[REF_FRAMES - 1];
- break;
-
- default: assert(0); break;
- }
-
- // Update refresh flags
- switch (gf_group->refresh_flag[gf_group->index]) {
- case LAST_FRAME:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case GOLDEN_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case BWDREF_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case ALTREF2_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case ALTREF_FRAME:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 1;
- break;
-
- default: assert(0); break;
- }
-
- switch (gf_group->update_type[gf_group->index]) {
- case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break;
-
- case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break;
-
- case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break;
-
- case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1;
- case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break;
-
- default: break;
- }
-}
-#endif // USE_GF16_MULTI_LAYER
-
// Define the reference buffers that will be updated post encode.
static void configure_buffer_updates(AV1_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
@@ -3667,14 +3071,6 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
cpi->rc.is_bipred_frame = 0;
cpi->rc.is_src_frame_ext_arf = 0;
-#if USE_GF16_MULTI_LAYER
- RATE_CONTROL *const rc = &cpi->rc;
- if (rc->baseline_gf_interval == 16) {
- configure_buffer_updates_16(cpi);
- return;
- }
-#endif // USE_GF16_MULTI_LAYER
-
switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
case KF_UPDATE:
cpi->refresh_last_frame = 1;
@@ -3979,8 +3375,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
: cpi->common.MBs;
// The multiplication by 256 reverses a scaling factor of (>> 8)
// applied when combining MB error values for the frame.
- twopass->mb_av_energy =
- log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+ twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
twopass->frame_avg_haar_energy =
log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
}
@@ -4020,9 +3415,6 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) {
}
twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
- // Increment the gf group index ready for the next frame.
- ++twopass->gf_group.index;
-
// If the rate control is drifting consider adjustment to min or maxq.
if ((cpi->oxcf.rc_mode != AOM_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index b0c1a21e4..4b7325ae2 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_FIRSTPASS_H_
-#define AV1_ENCODER_FIRSTPASS_H_
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
#include "av1/common/enums.h"
#include "av1/common/onyxc_int.h"
@@ -47,15 +47,7 @@ typedef struct {
// number of bi-predictive frames.
#define BFG_INTERVAL 2
// The maximum number of extra ALTREF's except ALTREF_FRAME
-// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered
-// to serve as references. Currently REF_FRAMES == 8.
-#define USE_GF16_MULTI_LAYER 0
-
-#if USE_GF16_MULTI_LAYER
-#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME)
-#else // !USE_GF16_MULTI_LAYER
#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
-#endif // USE_GF16_MULTI_LAYER
#define MIN_EXT_ARF_INTERVAL 4
@@ -126,6 +118,7 @@ typedef struct {
unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char pyramid_height;
+ unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
#endif
unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
@@ -197,10 +190,6 @@ void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
// Post encode update of the rate control parameters for 2-pass
void av1_twopass_postencode_update(struct AV1_COMP *cpi);
-#if USE_GF16_MULTI_LAYER
-void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index);
-#endif // USE_GF16_MULTI_LAYER
-
static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
if (arf_pending && MAX_EXT_ARFS > 0)
return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
@@ -216,4 +205,4 @@ static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_FIRSTPASS_H_
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
index f07d1bc00..e9f8b0bb4 100644
--- a/third_party/aom/av1/encoder/global_motion.c
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -32,8 +32,8 @@
// Border over which to compute the global motion
#define ERRORADV_BORDER 0
-static const double erroradv_tr[] = { 0.75, 0.70, 0.65 };
-static const double erroradv_prod_tr[] = { 22000, 20000, 18000 };
+static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
int erroradv_type) {
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
index 2c15753fd..c7c016c43 100644
--- a/third_party/aom/av1/encoder/global_motion.h
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
-#define AV1_ENCODER_GLOBAL_MOTION_H_
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
#include "aom/aom_integer.h"
#include "aom_scale/yv12config.h"
@@ -61,4 +61,4 @@ int compute_global_motion_feature_based(TransformationType type,
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_GLOBAL_MOTION_H_
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
index 45632da9b..945dc3733 100644
--- a/third_party/aom/av1/encoder/grain_test_vectors.h
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_GRAIN_TEST_VECTORS_H_
-#define AV1_GRAIN_TEST_VECTORS_H_
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
/* Test vectors for emulation of different film grain types.
* Note that bit depth would be derived from the bitstream and
@@ -778,4 +778,4 @@ static aom_film_grain_t film_grain_test_vectors[16] = {
45231 /* random_seed */
},
};
-#endif // AV1_GRAIN_TEST_VECTORS_H_
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
index 8b6227540..826c004d6 100644
--- a/third_party/aom/av1/encoder/hash.h
+++ b/third_party/aom/av1/encoder/hash.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HASH_H_
-#define AV1_ENCODER_HASH_H_
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
#include "config/aom_config.h"
@@ -43,8 +43,10 @@ typedef struct _CRC32C {
// init table for software version crc32c
void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_HASH_H_
+#endif // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
index f2ff5b495..e85a516e8 100644
--- a/third_party/aom/av1/encoder/hash_motion.c
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -13,14 +13,12 @@
#include "config/av1_rtcd.h"
+#include "av1/encoder/block.h"
#include "av1/encoder/hash.h"
#include "av1/encoder/hash_motion.h"
static const int crc_bits = 16;
static const int block_size_bits = 3;
-static CRC_CALCULATOR crc_calculator1;
-static CRC_CALCULATOR crc_calculator2;
-static int g_crc_initialized = 0;
static void hash_table_clear_all(hash_table *p_hash_table) {
if (p_hash_table->p_lookup_table == NULL) {
@@ -106,11 +104,11 @@ static int hash_block_size_to_index(int block_size) {
}
}
-void av1_hash_table_init(hash_table *p_hash_table) {
- if (g_crc_initialized == 0) {
- av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
- av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
- g_crc_initialized = 1;
+void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
+ if (x->g_crc_initialized == 0) {
+ av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
+ av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
+ x->g_crc_initialized = 1;
}
p_hash_table->p_lookup_table = NULL;
}
@@ -181,7 +179,8 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *pic_block_hash[2],
- int8_t *pic_block_same_info[3]) {
+ int8_t *pic_block_same_info[3],
+ MACROBLOCK *x) {
const int width = 2;
const int height = 2;
const int x_end = picture->y_crop_width - width + 1;
@@ -201,9 +200,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
pic_block_hash[0][pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
+ &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
pic_block_hash[1][pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+ &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
pos++;
}
pos += width - 1;
@@ -220,9 +219,9 @@ void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
pic_block_hash[0][pos] =
- av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
+ av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0]));
pic_block_hash[1][pos] =
- av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
+ av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0]));
pos++;
}
pos += width - 1;
@@ -235,7 +234,8 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *src_pic_block_hash[2],
uint32_t *dst_pic_block_hash[2],
int8_t *src_pic_block_same_info[3],
- int8_t *dst_pic_block_same_info[3]) {
+ int8_t *dst_pic_block_same_info[3],
+ MACROBLOCK *x) {
const int pic_width = picture->y_crop_width;
const int x_end = picture->y_crop_width - block_size + 1;
const int y_end = picture->y_crop_height - block_size + 1;
@@ -254,14 +254,14 @@ void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
dst_pic_block_hash[0][pos] =
- av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length);
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length);
p[0] = src_pic_block_hash[1][pos];
p[1] = src_pic_block_hash[1][pos + src_size];
p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
dst_pic_block_hash[1][pos] =
- av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length);
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length);
dst_pic_block_same_info[0][pos] =
src_pic_block_same_info[0][pos] &&
@@ -388,17 +388,9 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
return 1;
}
-// global buffer for hash value calculation of a block
-// used only in av1_get_block_hash_value()
-#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
-// [first hash/second hash]
-// [two buffers used ping-pong]
-// [num of 2x2 blocks in 128x128]
-static uint32_t hash_value_buffer[2][2][AOM_BUFFER_SIZE_FOR_BLOCK_HASH];
-
void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
uint32_t *hash_value1, uint32_t *hash_value2,
- int use_highbitdepth) {
+ int use_highbitdepth, MACROBLOCK *x) {
uint32_t to_hash[4];
const int add_value = hash_block_size_to_index(block_size) << crc_bits;
assert(add_value >= 0);
@@ -415,10 +407,12 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
get_pixels_in_1D_short_array_by_block_2x2(
y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- hash_value_buffer[0][0][pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
- hash_value_buffer[1][0][pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[0][0][pos] =
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] =
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
}
}
} else {
@@ -429,10 +423,10 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
stride, pixel_to_hash);
assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- hash_value_buffer[0][0][pos] = av1_get_crc_value(
- &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
- hash_value_buffer[1][0][pos] = av1_get_crc_value(
- &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
+ &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
+ &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
}
}
}
@@ -457,24 +451,24 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
assert(srcPos + src_sub_block_in_width + 1 <
AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
- to_hash[0] = hash_value_buffer[0][src_idx][srcPos];
- to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1];
+ to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
to_hash[2] =
- hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
- to_hash[3] =
- hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1];
+ x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[0][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
- hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
- &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+ x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
- to_hash[0] = hash_value_buffer[1][src_idx][srcPos];
- to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1];
+ to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
to_hash[2] =
- hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
- to_hash[3] =
- hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1];
- hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
- &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+ x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[1][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
+ x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
dst_pos++;
}
}
@@ -483,8 +477,6 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
sub_block_in_width >>= 1;
}
- *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
- *hash_value2 = hash_value_buffer[1][dst_idx][0];
+ *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
}
-
-#undef AOM_BUFFER_SIZE_FOR_BLOCK_HASH
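The net effect of this hash_motion change is that the CRC calculators and the ping-pong hash buffer move from file-scope statics into MACROBLOCK, so per-thread state replaces shared globals. A hypothetical caller after the change (init_and_hash is illustrative; the two av1_ functions are the real, updated signatures):

void init_and_hash(hash_table *table, MACROBLOCK *x, uint8_t *y_src,
                   int stride) {
  uint32_t hash1, hash2;
  /* Seeds x->crc_calculator1/2 once per MACROBLOCK. */
  av1_hash_table_init(table, x);
  av1_get_block_hash_value(y_src, stride, /*block_size=*/64, &hash1, &hash2,
                           /*use_highbitdepth=*/0, x);
}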
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
index 8deb92eb6..df3ec3215 100644
--- a/third_party/aom/av1/encoder/hash_motion.h
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HASH_MOTION_H_
-#define AV1_ENCODER_HASH_MOTION_H_
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
#include "config/aom_config.h"
@@ -34,7 +34,7 @@ typedef struct _hash_table {
Vector **p_lookup_table;
} hash_table;
-void av1_hash_table_init(hash_table *p_hash_table);
+void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
void av1_hash_table_destroy(hash_table *p_hash_table);
void av1_hash_table_create(hash_table *p_hash_table);
int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
@@ -44,13 +44,15 @@ int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
uint32_t hash_value2);
void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
uint32_t *pic_block_hash[2],
- int8_t *pic_block_same_info[3]);
+ int8_t *pic_block_same_info[3],
+ struct macroblock *x);
void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
int block_size,
uint32_t *src_pic_block_hash[2],
uint32_t *dst_pic_block_hash[2],
int8_t *src_pic_block_same_info[3],
- int8_t *dst_pic_block_same_info[3]);
+ int8_t *dst_pic_block_same_info[3],
+ struct macroblock *x);
void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
uint32_t *pic_hash[2],
int8_t *pic_is_same,
@@ -67,10 +69,10 @@ int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
int block_size, int x_start, int y_start);
void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
uint32_t *hash_value1, uint32_t *hash_value2,
- int use_highbitdepth);
+ int use_highbitdepth, struct macroblock *x);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_HASH_MOTION_H_
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
index 0922557d0..67898fd18 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -121,15 +121,45 @@ static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
int32_t *dst_coeff = (int32_t *)coeff;
- av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
- txfm_param->bd);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+    // use the C version for any transform that includes identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
}
static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TxfmParam *txfm_param) {
int32_t *dst_coeff = (int32_t *)coeff;
- av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
- txfm_param->bd);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+    // use the C version for any transform that includes identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
}
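
Both wrappers above implement the same dispatch rule: transform types with an identity component stay on the plain C kernel, everything else goes through the RTCD-dispatched (possibly SIMD) symbol. The rule in isolation:

/* 1 if either dimension of the 2D transform is an identity transform. */
static int tx_type_uses_identity(TX_TYPE tx_type) {
  switch (tx_type) {
    case V_DCT: case H_DCT: case V_ADST: case H_ADST:
    case V_FLIPADST: case H_FLIPADST: case IDTX: return 1;
    default: return 0;
  }
}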
static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
index 6155b255a..daabc7119 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_
-#define AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
#include "config/aom_config.h"
@@ -28,4 +28,4 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
} // extern "C"
#endif
-#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
index 3897c2a6a..e55224cf7 100644
--- a/third_party/aom/av1/encoder/lookahead.h
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_LOOKAHEAD_H_
-#define AV1_ENCODER_LOOKAHEAD_H_
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
#include "aom_scale/yv12config.h"
#include "aom/aom_integer.h"
@@ -103,4 +103,4 @@ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
} // extern "C"
#endif
-#endif // AV1_ENCODER_LOOKAHEAD_H_
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h
index 23243dd9e..64f936176 100644
--- a/third_party/aom/av1/encoder/mathutils.h
+++ b/third_party/aom/av1/encoder/mathutils.h
@@ -9,6 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
#include <memory.h>
#include <math.h>
#include <stdio.h>
@@ -23,7 +26,7 @@ static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
double c;
// Forward elimination
for (k = 0; k < n - 1; k++) {
- // Bring the largest magitude to the diagonal position
+ // Bring the largest magnitude to the diagonal position
for (i = n - 1; i > k; i--) {
if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
for (j = 0; j < n; j++) {
@@ -352,3 +355,5 @@ static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
return 0;
}
+
+#endif // AOM_AV1_ENCODER_MATHUTILS_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
index 472173634..1a35ff77c 100644
--- a/third_party/aom/av1/encoder/mbgraph.c
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -17,11 +17,12 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/system_state.h"
-#include "av1/encoder/segmentation.h"
-#include "av1/encoder/mcomp.h"
#include "av1/common/blockd.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
int mb_row, int mb_col) {
@@ -140,7 +141,7 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
// calculate SATD for each intra prediction mode;
// we're intentionally not doing 4x4, we just want a rough estimate
- for (mode = DC_PRED; mode <= PAETH_PRED; mode++) {
+ for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
unsigned int err;
xd->mi[0]->mode = mode;
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
index 3e0a4fa9b..ba08476f7 100644
--- a/third_party/aom/av1/encoder/mbgraph.h
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_MBGRAPH_H_
-#define AV1_ENCODER_MBGRAPH_H_
+#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
+#define AOM_AV1_ENCODER_MBGRAPH_H_
#ifdef __cplusplus
extern "C" {
@@ -38,4 +38,4 @@ void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_MBGRAPH_H_
+#endif // AOM_AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index c4572a341..8f6de9b53 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -29,6 +29,7 @@
#include "av1/encoder/encodemv.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
// #define NEW_DIAMOND_SEARCH
@@ -219,7 +220,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
thismse = upsampled_pref_error( \
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \
pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
- mask_stride, invert_mask, w, h, &sse); \
+ mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
v += thismse; \
if (v < besterr) { \
@@ -342,19 +343,19 @@ static unsigned int setup_center_error(
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
if (mask) {
- aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset,
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
y_stride, mask, mask_stride, invert_mask);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
- aom_highbd_jnt_comp_avg_pred(comp_pred16, second_pred, w, h,
- y + offset, y_stride, &xd->jcp_param);
+ aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
else
- aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
y_stride);
}
- besterr =
- vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
if (mask) {
@@ -648,51 +649,54 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
int subpel_x_q3, int subpel_y_q3,
const uint8_t *second_pred, const uint8_t *mask,
int mask_stride, int invert_mask, int w, int h,
- unsigned int *sse) {
+ unsigned int *sse, int subpel_search) {
unsigned int besterr;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
if (second_pred != NULL) {
if (mask) {
aom_highbd_comp_mask_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd);
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+ subpel_search);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
aom_highbd_jnt_comp_avg_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param);
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search);
else
- aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16,
- second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, xd->bd);
+ aom_highbd_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, subpel_search);
}
} else {
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
}
-
- besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
+ besterr = vfp->vf(pred8, w, src, src_stride, sse);
} else {
DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
if (second_pred != NULL) {
if (mask) {
- aom_comp_mask_upsampled_pred(
- xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask);
+ aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+ second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask,
+ mask_stride, invert_mask, subpel_search);
} else {
if (xd->jcp_param.use_jnt_comp_avg)
aom_jnt_comp_avg_upsampled_pred(
xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride, &xd->jcp_param);
+ subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
else
aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
second_pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
}
} else {
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
}
besterr = vfp->vf(pred, w, src, src_stride, sse);
@@ -707,10 +711,11 @@ static unsigned int upsampled_setup_center_error(
const int src_stride, const uint8_t *const y, int y_stride,
const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
- unsigned int *sse1, int *distortion) {
- unsigned int besterr = upsampled_pref_error(
- xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset,
- y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1);
+ unsigned int *sse1, int *distortion, int subpel_search) {
+ unsigned int besterr =
+ upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
+ y + offset, y_stride, 0, 0, second_pred, mask,
+ mask_stride, invert_mask, w, h, sse1, subpel_search);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
@@ -781,7 +786,8 @@ int av1_find_best_sub_pixel_tree(
besterr = upsampled_setup_center_error(
xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
- h, offset, mvjcost, mvcost, sse1, distortion);
+ h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
else
besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
src_address, src_stride, y, y_stride,
@@ -802,7 +808,8 @@ int av1_find_best_sub_pixel_tree(
thismse = upsampled_pref_error(
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
- mask, mask_stride, invert_mask, w, h, &sse);
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = estimate_upsampled_pref_error(
xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -837,7 +844,8 @@ int av1_find_best_sub_pixel_tree(
thismse = upsampled_pref_error(
xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
- mask, mask_stride, invert_mask, w, h, &sse);
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = estimate_upsampled_pref_error(
xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -929,8 +937,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
int16_t bc = mbmi->mv[0].as_mv.col;
int16_t *tr = &mbmi->mv[0].as_mv.row;
int16_t *tc = &mbmi->mv[0].as_mv.col;
- WarpedMotionParams best_wm_params = mbmi->wm_params[0];
- int best_num_proj_ref = mbmi->num_proj_ref[0];
+ WarpedMotionParams best_wm_params = mbmi->wm_params;
+ int best_num_proj_ref = mbmi->num_proj_ref;
unsigned int bestmse;
int minc, maxc, minr, maxr;
const int start = cm->allow_high_precision_mv ? 0 : 4;
@@ -962,18 +970,18 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
if (total_samples > 1)
- mbmi->num_proj_ref[0] =
+ mbmi->num_proj_ref =
selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
- if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr,
- *tc, &mbmi->wm_params[0], mi_row, mi_col)) {
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
+ *tc, &mbmi->wm_params, mi_row, mi_col)) {
thismse =
av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
if (thismse < bestmse) {
best_idx = idx;
- best_wm_params = mbmi->wm_params[0];
- best_num_proj_ref = mbmi->num_proj_ref[0];
+ best_wm_params = mbmi->wm_params;
+ best_num_proj_ref = mbmi->num_proj_ref;
bestmse = thismse;
}
}
@@ -990,8 +998,8 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
*tr = br;
*tc = bc;
- mbmi->wm_params[0] = best_wm_params;
- mbmi->num_proj_ref[0] = best_num_proj_ref;
+ mbmi->wm_params = best_wm_params;
+ mbmi->num_proj_ref = best_num_proj_ref;
return bestmse;
}
@@ -2013,8 +2021,16 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
const uint8_t *mask, int mask_stride,
int invert_mask, const MV *center_mv,
const uint8_t *second_pred) {
- const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
- { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+ static const search_neighbors neighbors[8] = {
+ { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+ };
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
@@ -2022,6 +2038,10 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
MV *best_mv = &x->best_mv.as_mv;
unsigned int best_sad = INT_MAX;
int i, j;
+ uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] =
+ { 0 };
+ int grid_center = SEARCH_GRID_CENTER_8P;
+ int grid_coord = grid_center;
clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
x->mv_limits.row_min, x->mv_limits.row_max);
@@ -2043,13 +2063,20 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
}
+ do_refine_search_grid[grid_coord] = 1;
+
for (i = 0; i < search_range; ++i) {
int best_site = -1;
for (j = 0; j < 8; ++j) {
- const MV mv = { best_mv->row + neighbors[j].row,
- best_mv->col + neighbors[j].col };
+ grid_coord = grid_center + neighbors[j].coord_offset;
+ if (do_refine_search_grid[grid_coord] == 1) {
+ continue;
+ }
+ const MV mv = { best_mv->row + neighbors[j].coord.row,
+ best_mv->col + neighbors[j].coord.col };
+ do_refine_search_grid[grid_coord] = 1;
if (is_mv_in(&x->mv_limits, &mv)) {
unsigned int sad;
if (mask) {
@@ -2079,8 +2106,9 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
if (best_site == -1) {
break;
} else {
- best_mv->row += neighbors[best_site].row;
- best_mv->col += neighbors[best_site].col;
+ best_mv->row += neighbors[best_site].coord.row;
+ best_mv->col += neighbors[best_site].coord.col;
+ grid_center += neighbors[best_site].coord_offset;
}
}
return best_sad;
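
The do_refine_search_grid bookkeeping above avoids re-evaluating SAD at points the refinement already visited. The index math it relies on (see the SEARCH_GRID_* macros in mcomp.h): with a search range of 3 the grid is 7x7, the center starts at flat index 3 * 7 + 3 = 24, and moving by (dr, dc) shifts the index by dr * 7 + dc.

#define RANGE 3                         /* SEARCH_RANGE_8P */
#define STRIDE (2 * RANGE + 1)          /* SEARCH_GRID_STRIDE_8P = 7 */
#define CENTER (RANGE * STRIDE + RANGE) /* SEARCH_GRID_CENTER_8P = 24 */

static int grid_offset(int dr, int dc) { return dr * STRIDE + dc; }
/* e.g. neighbor (-1, 0) of the center lands at 24 + grid_offset(-1, 0)
 * = 24 - 7 = 17 in the 49-entry visited array. */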
@@ -2099,11 +2127,11 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
}
int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- MV *mvp_full, int step_param, int error_per_bit,
+ MV *mvp_full, int step_param, int method,
+ int run_mesh_search, int error_per_bit,
int *cost_list, const MV *ref_mv, int var_max, int rd,
int x_pos, int y_pos, int intra) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const SEARCH_METHODS method = sf->mv.search_method;
const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
@@ -2168,11 +2196,35 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
default: assert(0 && "Invalid search method.");
}
+  // Should we allow a follow-on exhaustive search?
+ if (!run_mesh_search) {
+ if (method == NSTEP) {
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) run_mesh_search = 1;
+ }
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+
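The shift in the threshold computation above scales exhaustive_searches_thresh by block area: mi_size_wide_log2 + mi_size_high_log2 is 10 for a 128x128 block (32x32 MI units), so the base threshold appears to be calibrated for the largest block size and divided down for smaller ones; that reading is an inference from the arithmetic, not something the patch states. In short:

/* base_thresh >> (10 - log2(block area in MI units)). */
static int mesh_threshold(int base_thresh, int mi_wide_log2,
                          int mi_high_log2) {
  return base_thresh >> (10 - (mi_wide_log2 + mi_high_log2));
}
/* e.g. a 64x64 block is 16x16 MI units (log2 4 + 4), giving
 * base_thresh >> 2. */
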
if (method != NSTEP && rd && var < var_max)
var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
do {
- if (!av1_use_hash_me(&cpi->common)) break;
+ if (!intra || !av1_use_hash_me(&cpi->common)) break;
// already single ME
// get block size and original buffer of current block
@@ -2195,7 +2247,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
av1_get_block_hash_value(
what, what_stride, block_width, &hash_value1, &hash_value2,
- x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
    // for intra, at least one match can always be found: the block itself.
@@ -2279,7 +2331,8 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
MV this_mv = { r, c }; \
thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \
mask, vfp, z, pre(y, y_stride, r, c), \
- y_stride, sp(c), sp(r), w, h, &sse); \
+ y_stride, sp(c), sp(r), w, h, &sse, \
+ use_accurate_subpel_search); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
@@ -2307,18 +2360,20 @@ static int upsampled_obmc_pref_error(
MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
const int32_t *const wsrc, const uint8_t *const y, int y_stride,
- int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) {
+ int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+ int subpel_search) {
unsigned int besterr;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
- subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
- besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+ DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
+ besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
} else {
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
- subpel_y_q3, y, y_stride);
+ subpel_y_q3, y, y_stride, subpel_search);
besterr = vfp->ovf(pred, w, wsrc, mask, sse);
}
@@ -2330,10 +2385,11 @@ static unsigned int upsampled_setup_obmc_center_error(
const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
const uint8_t *const y, int y_stride, int w, int h, int offset,
- int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
- unsigned int besterr =
- upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc,
- y + offset, y_stride, 0, 0, w, h, sse1);
+ int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+ int subpel_search) {
+ unsigned int besterr = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+ 0, w, h, sse1, subpel_search);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
@@ -2388,11 +2444,12 @@ int av1_find_best_obmc_sub_pixel_tree_up(
bestmv->row *= 8;
bestmv->col *= 8;
- // use_accurate_subpel_search can be 0 or 1
+  // use_accurate_subpel_search can be 0, 1, or 2
if (use_accurate_subpel_search)
besterr = upsampled_setup_obmc_center_error(
xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
- y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion);
+ y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
else
besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
z, y, y_stride, offset, mvjcost, mvcost,
@@ -2408,7 +2465,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
if (use_accurate_subpel_search) {
thismse = upsampled_obmc_pref_error(
xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
- pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
sp(tr), src_address, mask, &sse);
@@ -2439,7 +2497,8 @@ int av1_find_best_obmc_sub_pixel_tree_up(
if (use_accurate_subpel_search) {
thismse = upsampled_obmc_pref_error(
xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
- pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
} else {
thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
src_address, mask, &sse);
@@ -2643,11 +2702,12 @@ int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
return best_sad;
}
-int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second) {
+static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv,
+ int is_second) {
const int32_t *wsrc = x->wsrc_buf;
const int32_t *mask = x->mask_buf;
MV temp_mv;
@@ -2704,6 +2764,31 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
+int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+ int step_param, int sadpb, int further_steps,
+ int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ if (cpi->sf.obmc_full_pixel_search_level == 0) {
+ return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
+ further_steps, do_refine, fn_ptr, ref_mv,
+ dst_mv, is_second);
+ } else {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int search_range = 8;
+ *dst_mv = *mvp_full;
+ clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ int thissme = obmc_refining_search_sad(
+ x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ return thissme;
+ }
+}
+
// Note(yunqingwang): The following 2 functions are only used in the motion
// vector unit test, which return extreme motion vectors allowed by the MV
// limits.
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
index 539e8f4e4..a975218b0 100644
--- a/third_party/aom/av1/encoder/mcomp.h
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_MCOMP_H_
-#define AV1_ENCODER_MCOMP_H_
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
#include "av1/encoder/block.h"
#include "aom_dsp/variance.h"
@@ -31,6 +31,11 @@ extern "C" {
// for Block_16x16
#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
// motion search site
typedef struct search_site {
MV mv;
@@ -43,6 +48,11 @@ typedef struct search_site_config {
int searches_per_step;
} search_site_config;
+typedef struct {
+ MV coord;
+ int coord_offset;
+} search_neighbors;
+
void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
void av1_init3smotion_compensation(search_site_config *cfg, int stride);
@@ -120,14 +130,15 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full, int step_param,
- int error_per_bit, int *cost_list, const MV *ref_mv,
- int var_max, int rd, int x_pos, int y_pos, int intra);
-
-int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second);
+ int method, int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
int av1_find_best_obmc_sub_pixel_tree_up(
MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
@@ -147,4 +158,4 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
} // extern "C"
#endif
-#endif // AV1_ENCODER_MCOMP_H_
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
index 3a27e5845..d21def43a 100644
--- a/third_party/aom/av1/encoder/ml.c
+++ b/third_party/aom/av1/encoder/ml.c
@@ -10,7 +10,9 @@
*/
#include <assert.h>
+#include <math.h>
+#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/ml.h"
void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
@@ -55,3 +57,17 @@ void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
weights += num_input_nodes;
}
}
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+ // Softmax function is invariant to adding the same constant
+ // to all input values, so we subtract the maximum input to avoid
+ // possible overflow.
+ float max_inp = input[0];
+ for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ output[i] = (float)exp(input[i] - max_inp);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
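The subtract-max step in av1_nn_softmax() matters: softmax(x) == softmax(x + c) for any constant c, and expf() overflows float for inputs somewhere above ~88, so shifting every input by the maximum keeps all exponents at or below 0. A standalone demo of the same trick (a local re-implementation, not a call into the patched file), showing that shifting all inputs by 100 leaves the distribution unchanged:

/* Standalone demo of the subtract-max trick used by av1_nn_softmax().
 * Compile with -lm. */
#include <math.h>
#include <stdio.h>

static void softmax(const float *input, float *output, int n) {
  float max_inp = input[0];
  for (int i = 1; i < n; i++) max_inp = fmaxf(max_inp, input[i]);
  float sum_out = 0.0f;
  for (int i = 0; i < n; i++) {
    output[i] = expf(input[i] - max_inp); /* exponent is always <= 0 */
    sum_out += output[i];
  }
  for (int i = 0; i < n; i++) output[i] /= sum_out;
}

int main(void) {
  const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
  const float b[4] = { 101.0f, 102.0f, 103.0f, 104.0f }; /* a[i] + 100 */
  float pa[4], pb[4];
  softmax(a, pa, 4);
  softmax(b, pb, 4);
  /* The two columns match: the shift by 100 cancels out. */
  for (int i = 0; i < 4; i++) printf("%.6f %.6f\n", pa[i], pb[i]);
  return 0;
}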
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
index 614cb60bb..cb8ef2871 100644
--- a/third_party/aom/av1/encoder/ml.h
+++ b/third_party/aom/av1/encoder/ml.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_ML_H_
-#define AV1_ENCODER_ML_H_
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
#ifdef __cplusplus
extern "C" {
@@ -37,8 +37,13 @@ typedef struct {
void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
float *output);
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_RD_H_
+#endif // AOM_AV1_ENCODER_ML_H_
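Putting the two ml.h entry points together: av1_nn_predict() runs the forward pass described by an NN_CONFIG, and the new av1_nn_softmax() turns the raw scores into probabilities. A usage sketch under the assumption that the translation unit also includes av1/encoder/partition_model_weights.h (where the retrained av1_4_partition_nnconfig_16 below lives); the feature vector itself is hypothetical:

#include "av1/encoder/ml.h"
#include "av1/encoder/partition_model_weights.h" /* av1_4_partition_nnconfig_16 */

/* Score the 4-way partition choices for a 16x16 block: 18 input features
 * (FEATURE_SIZE), 4 raw outputs (LABEL_SIZE), normalized to probabilities. */
static void score_4_partition_16(const float features[18], float probs[4]) {
  float raw[4];
  av1_nn_predict(features, &av1_4_partition_nnconfig_16, raw);
  av1_nn_softmax(raw, probs, 4);
}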
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index bbdd50784..8b88c4755 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PALETTE_H_
-#define AV1_ENCODER_PALETTE_H_
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
#include "av1/common/blockd.h"
@@ -93,4 +93,4 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
} // extern "C"
#endif
-#endif /* AV1_ENCODER_PALETTE_H_ */
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
index 279d39495..437ea43f9 100644
--- a/third_party/aom/av1/encoder/partition_model_weights.h
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
#ifdef __cplusplus
extern "C" {
@@ -1314,204 +1314,112 @@ static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
#define FEATURE_SIZE 18
#define LABEL_SIZE 4
-static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 48] = {
- 0.121894f, 0.058485f, 0.702226f, 0.015457f, -0.123380f, -0.573450f,
- 0.319576f, 0.118808f, 0.166057f, 0.526984f, 0.015211f, -0.025050f,
- 0.085717f, -0.028221f, -0.580062f, -0.270530f, -0.092371f, 0.037679f,
- 0.083573f, 0.007112f, -0.358623f, -0.264443f, -0.064819f, 0.022013f,
- -0.040077f, -0.291967f, -0.293100f, 0.072266f, -0.270572f, -0.292253f,
- -0.260105f, -0.294472f, -0.275752f, 0.054315f, 0.000085f, 0.105115f,
- -0.363572f, -0.016542f, 0.185943f, -0.359903f, 0.038765f, -0.377668f,
- 0.172692f, 0.127749f, -0.031275f, -0.242528f, -0.145880f, -0.055247f,
- -0.000265f, -0.355224f, 0.089917f, -0.377841f, -0.209766f, 0.030899f,
- 0.039546f, -0.375030f, -0.041605f, 0.137677f, 0.021282f, -0.150442f,
- -0.189445f, 0.009293f, -0.316033f, 0.038745f, -0.278761f, 0.005692f,
- -0.071763f, -0.302936f, -0.224572f, -0.211841f, 0.057503f, 0.005435f,
- -0.930979f, 0.115513f, 0.689958f, 0.221318f, 1.003891f, 0.359540f,
- -0.640534f, -0.162373f, -0.118105f, 0.205587f, 0.019710f, 0.025067f,
- -0.025344f, 0.002831f, 0.033078f, 0.040175f, -0.007502f, 0.026272f,
- 0.083443f, -0.880884f, 0.436948f, 0.293297f, 0.051678f, -0.133328f,
- -0.180323f, 0.667835f, 0.070733f, -0.003060f, -0.221804f, 0.146601f,
- 0.064024f, 0.056758f, -0.077361f, 0.105587f, -0.185500f, -0.133552f,
- 0.138269f, 0.165055f, 0.628284f, 0.846449f, 0.058825f, 0.223157f,
- 0.277896f, -0.381303f, 0.408241f, 0.643301f, 0.067494f, 0.120822f,
- -0.182491f, -0.111373f, -0.033374f, 0.131387f, -0.114654f, 0.114318f,
- 0.094718f, -0.052232f, 0.385903f, 1.212304f, 0.425305f, -0.052993f,
- 0.291474f, -0.319730f, 0.023090f, -0.317259f, 0.011181f, -0.034185f,
- -0.100671f, 0.186185f, -0.432511f, -0.115957f, -0.067746f, -0.177810f,
- -0.226700f, 0.004464f, 0.006809f, 0.171360f, -0.080723f, 0.099826f,
- -0.062301f, -0.358755f, -0.202549f, -0.084616f, -0.042313f, -0.325560f,
- 0.010452f, -0.341089f, -0.013566f, -0.340129f, 0.034675f, -0.036518f,
- -0.036473f, -0.192892f, 0.650235f, 0.609437f, -0.160982f, 0.125535f,
- -1.004575f, 0.521969f, 1.318091f, 0.614004f, -0.106622f, -0.077453f,
- -0.037328f, -0.081940f, 0.007640f, 0.026654f, -0.080332f, -0.077356f,
- -0.288170f, -0.319680f, -0.131712f, -0.150985f, 0.073218f, 0.089502f,
- -0.280502f, 0.003941f, -0.249937f, 0.244263f, 0.023269f, 0.080263f,
- 0.073172f, -0.200036f, 0.022381f, 0.008592f, -0.339517f, -0.135073f,
- 0.177199f, 0.208363f, 0.652360f, 0.272990f, 0.609535f, 0.145805f,
- 0.022527f, -0.088378f, 0.205008f, 0.101021f, -0.019673f, -0.252681f,
- 0.116034f, -0.062052f, 0.009991f, 0.138933f, -0.182428f, 0.052542f,
- -0.350825f, -0.122654f, -0.154687f, 0.066747f, 0.021541f, -0.212169f,
- -0.087093f, -0.087488f, 0.178129f, -0.146544f, 0.013919f, -0.273899f,
- 0.223753f, -0.187327f, -0.118795f, -0.191892f, -0.355979f, 0.023794f,
- -0.135236f, 0.058918f, 0.069080f, 0.279287f, 0.369689f, 1.134526f,
- 0.659511f, 0.250223f, 0.286040f, 0.515284f, 0.067791f, -0.156385f,
- 0.143283f, 0.050884f, 0.089956f, -0.040850f, -0.003650f, -0.081162f,
- 0.086004f, 0.116578f, 0.826254f, 0.504869f, -0.196022f, -0.207279f,
- 0.200503f, -0.196801f, 0.008211f, 0.411158f, -0.075855f, -0.036690f,
- 0.111519f, -0.057838f, -0.005846f, 0.111067f, 0.174712f, -0.078054f,
- 0.765897f, 0.018670f, -0.306960f, -0.020034f, -0.332875f, 0.662707f,
- -0.461233f, -1.007542f, -0.693995f, -1.243352f, -0.014745f, 0.004036f,
- -0.009141f, 0.003325f, -0.011233f, -0.000819f, 0.006369f, 0.002418f,
- -0.035906f, -0.005135f, 1.073830f, 1.020736f, -0.182611f, -1.038976f,
- -0.226695f, -0.375663f, 0.364568f, 0.620995f, -0.018615f, 0.011347f,
- 0.045786f, 0.041077f, 0.010886f, -0.148428f, 0.028007f, -0.022322f,
- -0.165985f, 0.233315f, -0.277531f, -0.329683f, -0.516967f, -0.390750f,
- 0.006948f, 0.133744f, -0.375681f, -0.116877f, -0.009441f, -0.008597f,
- -0.160679f, 0.102150f, -0.142647f, -0.117501f, 0.035035f, 0.228687f,
- -1.117397f, -0.005171f, -0.008708f, 0.413042f, -0.298532f, 0.614909f,
- -0.181084f, -0.711770f, 0.344033f, 0.287220f, -0.112848f, -0.052866f,
- -0.222466f, 0.025029f, -0.107558f, 0.137036f, -0.276661f, -0.038808f,
- -0.057448f, 0.037563f, 0.526020f, 0.447997f, 0.288366f, 0.264815f,
- 0.319974f, -0.193091f, 0.353830f, 0.412950f, -0.280454f, 0.092737f,
- 0.070919f, 0.043336f, 0.041214f, -0.052147f, 0.010860f, 0.191325f,
- 0.079783f, -0.425672f, -0.053469f, -0.005495f, 0.184526f, -0.166171f,
- 0.084459f, -0.042165f, -0.261759f, -0.248723f, -0.073483f, -0.377884f,
- -0.189614f, -0.054146f, -0.261279f, 0.196347f, -0.087568f, 0.070533f,
- -0.145492f, -0.041500f, -0.465861f, 0.077369f, 0.020645f, -0.440232f,
- -0.414585f, -0.168627f, -0.050011f, -0.336676f, -0.344943f, -0.288140f,
- 0.085513f, -0.200425f, 0.218516f, 0.049604f, -0.280952f, -0.242674f,
- -1.969931f, 0.013374f, -0.039643f, 1.113947f, 0.018568f, 0.916330f,
- -0.302934f, -0.225816f, 0.189529f, -0.361971f, 0.021073f, -0.050143f,
- -0.041415f, 0.015126f, 0.018091f, -0.082401f, 0.017152f, 0.064856f,
- 0.156170f, 0.145323f, -0.281409f, 0.213357f, -0.058966f, 0.158668f,
- 0.033742f, 0.378820f, -0.662875f, -0.455532f, -0.702928f, 0.234325f,
- 0.139627f, -1.360650f, 0.040921f, -0.044373f, -0.059999f, -0.048565f,
- 0.115339f, -0.105888f, -0.170567f, -0.206097f, -0.349537f, 0.107941f,
- -0.356286f, -0.374928f, 0.143257f, -0.317790f, 0.079875f, -0.359345f,
- 0.081321f, -0.219772f, -0.077213f, 0.110624f, -0.252329f, -0.266481f,
- 0.190135f, 0.121214f, 0.661064f, -0.037820f, -0.373068f, -0.065209f,
- -0.286154f, -0.120695f, -0.110670f, -0.193589f, -0.010867f, -0.048054f,
- -0.032010f, 0.110627f, 0.054094f, -0.884309f, -1.171623f, -0.386911f,
- -0.756058f, 0.030362f, 0.563628f, -0.334227f, -0.111213f, 1.143898f,
- -0.940454f, 0.084510f, 0.671010f, 0.312244f, -0.052592f, -0.014376f,
- 0.039965f, -0.010763f, -0.114936f, -0.146020f, 0.015874f, 0.027439f,
- -1.702315f, 0.148702f, 0.153021f, 0.363147f, -0.488933f, 0.220772f,
- 0.640310f, -0.173911f, -0.169523f, -0.082261f, -0.014854f, 0.024414f,
- 0.061041f, -0.013998f, 0.086539f, 0.000466f, 0.037472f, -0.010665f,
- -0.326646f, 0.106971f, 0.405589f, 0.555345f, -0.318315f, 0.526498f,
- 0.119246f, 0.022213f, 0.171237f, 0.214651f, 0.062904f, -0.023764f,
- 0.011831f, 0.079644f, -0.096530f, -0.054373f, -0.306309f, -0.203709f,
- -0.353217f, -0.350005f, -0.329549f, 0.062679f, -0.387625f, -0.237111f,
- -0.025050f, -0.193987f, 0.002235f, -0.380821f, -0.051036f, -0.136020f,
- 0.077989f, -0.361691f, 0.120485f, 0.157746f, 0.073394f, -0.284401f,
- 0.113221f, 0.109808f, 0.000197f, 0.122523f, 0.081411f, -0.048544f,
- -0.136577f, -0.007158f, -0.208952f, -0.276831f, 0.260479f, -1.392915f,
- -0.865248f, 0.114577f, -0.000749f, -0.060338f, -0.091176f, -0.108421f,
- 0.221256f, 0.100176f, -0.877560f, -1.248838f, 0.643005f, 0.064580f,
- -0.049878f, 0.267988f, -0.434340f, -0.299254f, -0.097572f, 0.009606f,
- 0.063810f, -0.090525f, 0.027760f, 0.043484f, 0.041697f, 0.108024f,
- -0.359586f, -0.197090f, 0.121397f, 0.152206f, -0.391126f, -0.283145f,
- 0.008754f, -0.059022f, -0.218745f, 0.043042f, -0.056716f, 0.153051f,
- -0.210372f, -0.029681f, -0.288354f, 0.065242f, -0.189376f, 0.115013f,
- -0.251488f, -0.533091f, 0.037768f, -0.319107f, -0.161364f, -0.103967f,
- 0.063271f, -0.313289f, -0.312093f, -0.045239f, 0.150607f, 0.001487f,
- 0.019602f, -0.338031f, -0.036214f, 0.112736f, -0.367762f, 0.122367f,
- 0.094670f, 0.175590f, 0.301041f, -0.135257f, 0.539620f, 0.328619f,
- -0.163971f, 0.137256f, 0.238805f, 0.483722f, 0.121353f, 0.083630f,
- -0.283568f, 0.291661f, -0.061122f, -0.195295f, 0.153459f, -0.153727f,
- -0.238839f, -0.071736f, 0.601437f, -0.664072f, 0.230827f, 0.198753f,
- -0.039196f, 0.206751f, 0.529020f, 0.904132f, -0.219471f, 0.186694f,
- -0.208608f, -0.093385f, -0.161617f, 0.003930f, -0.429869f, -0.123563f,
- 0.626098f, -0.002495f, -0.245511f, -1.069848f, 0.296115f, -0.940267f,
- -1.649122f, -0.512937f, -0.802874f, -1.000239f, -0.027629f, 0.020434f,
- -0.003030f, 0.035986f, -0.004812f, -0.009193f, -0.004644f, -0.024347f,
- 0.068439f, -0.314339f, 0.095057f, -0.212372f, 0.197523f, -0.040878f,
- -0.272164f, -0.243326f, -0.204955f, 0.157199f, -0.049964f, -0.091537f,
- -0.058012f, -0.306650f, 0.098621f, -0.146778f, -0.154447f, -0.177889f,
- -0.009698f, 0.025427f, 0.350576f, -0.448237f, -0.068823f, 1.224960f,
- -0.776883f, -0.692167f, -0.948497f, -0.492598f, 0.029440f, -0.056460f,
- 0.021654f, 0.004352f, 0.041508f, -0.027179f, 0.006789f, -0.023573f,
- 0.207775f, -0.280273f, -0.347984f, -0.129935f, 0.151512f, -0.087294f,
- -0.494352f, -0.341424f, 0.044084f, -0.064080f, 0.073091f, -0.145574f,
- 0.094715f, -0.258786f, -0.020419f, -0.401823f, 0.009397f, -0.138642f,
- -0.034953f, -0.077419f, 0.636610f, 0.314980f, 1.110610f, -0.343368f,
- 0.696647f, -0.649667f, 0.653491f, -0.096006f, -0.090469f, -0.066975f,
- -0.105864f, -0.015666f, 0.102056f, -0.105344f, -0.273495f, -0.014686f,
- 0.122031f, 0.139524f, -1.042029f, -0.562510f, 0.885644f, 1.088059f,
- 0.189223f, 0.049404f, -0.167371f, 0.018703f, -0.208390f, -0.159002f,
- -0.377130f, -0.151118f, 0.117861f, 0.026986f, -0.032433f, 0.081603f,
- -0.106729f, -0.040134f, 0.015161f, 0.290572f, 0.241446f, 1.390085f,
- 0.438915f, -0.358097f, -0.171799f, 0.879758f, -0.014110f, 0.029562f,
- -0.073583f, -0.125817f, -0.036512f, -0.040275f, 0.037997f, 0.120979f,
- 0.064538f, -0.038841f, 0.034797f, 0.110229f, -0.239779f, -0.004558f,
- 0.226534f, 0.111286f, -0.268198f, 0.237673f, -0.328237f, -0.090774f,
- -0.269690f, -0.202147f, -0.181808f, -0.305238f, 0.110058f, -0.169217f,
- -0.300125f, 0.069031f, -0.081358f, -0.376174f, -0.349980f, 0.071443f,
- -0.396278f, -0.389503f, -0.190410f, -0.014767f, -0.265229f, -0.099787f,
- 0.079847f, -0.214580f, -0.235661f, -0.184227f, 0.111099f, -0.083945f,
- -0.153809f, -0.284092f, -0.132497f, -0.154841f, -0.517157f, -0.640603f,
- -0.357036f, -0.486142f, -0.182819f, -0.475022f, 0.079282f, 0.081168f,
- -0.120831f, -0.016048f, -0.232495f, 0.214329f, -0.055058f, 0.032856f,
- 0.061753f, 0.003226f, 0.097028f, 0.084535f, -1.563199f, 0.434928f,
- -0.403710f, 0.520696f, -0.401696f, 0.450568f, -0.074121f, 0.076622f,
- -0.098421f, 0.167036f, -0.255250f, -0.526313f, -0.933693f, -0.558104f,
- 0.194341f, 0.173326f, 0.071112f, -0.651961f, -1.327587f, -0.705289f,
- -1.138889f, 0.197167f, -0.714654f, -0.113891f, 0.080158f, 0.000301f,
- 0.057905f, 0.060718f, -0.635995f, 0.100026f, -0.038239f, -0.025530f,
-};
-
-static const float av1_4_partition_nn_bias_16_layer0[48] = {
- -0.079252f, -0.083606f, -0.112759f, -0.071622f, 0.444562f, 0.215649f,
- -0.337661f, -0.242379f, -0.053829f, 0.165168f, -0.076613f, -0.190579f,
- -0.060175f, -0.571661f, -0.454075f, -1.462711f, -0.161563f, -0.088748f,
- -0.030279f, -0.456293f, -0.134473f, -0.194976f, 0.044373f, -0.503954f,
- -0.083563f, 0.123344f, 0.011821f, 0.085445f, -0.050294f, -0.135194f,
- 0.057815f, 0.543558f, -0.090602f, -0.104671f, -0.285075f, 0.354335f,
- 1.037007f, -0.023879f, -0.025025f, -0.094408f, -0.101200f, -0.142105f,
- -0.380607f, -0.059067f, -0.113017f, -0.137448f, -0.177840f, 0.468505f,
-};
-
-static const float av1_4_partition_nn_weights_16_layer1[48 * LABEL_SIZE] = {
- 0.174954f, -0.239117f, 0.073252f, 0.258881f, 0.579781f, 0.441827f,
- 0.372037f, -0.062362f, 0.068477f, 0.376811f, -0.130520f, 0.214951f,
- -0.200674f, 0.240347f, 0.152954f, 1.360264f, 0.334630f, -0.064789f,
- -0.270826f, 0.212699f, 0.045669f, -0.150852f, -0.412603f, 0.122481f,
- -0.230246f, 0.005004f, 0.321417f, -0.554083f, -0.186742f, -0.197687f,
- -0.028669f, -0.138559f, -0.117773f, 0.024953f, 0.326367f, -0.109951f,
- -1.098959f, -0.136134f, 0.563218f, 0.191799f, 0.126191f, -0.093113f,
- 0.185371f, 0.058468f, 0.245247f, -0.138064f, -0.471573f, -0.209372f,
- -0.111171f, 0.222275f, -0.350556f, -0.106336f, 0.268877f, 0.090639f,
- -0.083008f, -0.190791f, -0.243922f, -0.121182f, -0.133733f, -0.078450f,
- 0.099751f, 0.353020f, -0.199079f, -0.463492f, -0.647884f, 0.166611f,
- -0.464034f, 0.045096f, -0.312178f, -0.190972f, -0.468297f, 0.662376f,
- -0.197071f, -0.653123f, -0.354365f, -0.088501f, -0.302671f, 0.140713f,
- 0.885444f, 0.350273f, -0.003345f, 0.217260f, 0.219156f, 0.240653f,
- 0.347840f, 0.101849f, -0.244565f, -0.166971f, 0.091056f, 0.319912f,
- 0.268459f, 0.250726f, -0.155819f, -0.087588f, 0.010749f, -0.192344f,
- 0.344808f, 0.223482f, -0.189563f, -0.067317f, -0.348191f, -0.085265f,
- 0.259318f, 0.102408f, 0.096675f, -0.255564f, -0.168480f, -0.068189f,
- -0.457704f, 0.010565f, 0.228573f, -0.124421f, 0.202488f, 0.148519f,
- 0.002180f, 0.099099f, -0.179019f, 0.245414f, -0.038307f, 0.116897f,
- -0.031377f, 0.368533f, -0.793891f, 0.148614f, 0.075441f, 0.102465f,
- -0.310002f, -0.355369f, -0.206713f, -0.262276f, 0.068578f, -0.044980f,
- 0.092689f, -0.181058f, 0.016279f, 0.155965f, 0.545361f, -0.390699f,
- -0.042457f, 0.110238f, 0.114640f, 0.112525f, 0.522221f, 0.533164f,
- -0.331720f, -0.212966f, 0.140823f, 0.251311f, -0.006092f, -0.800438f,
- 0.007981f, -0.585140f, -0.006526f, 0.541683f, -0.298498f, 0.084322f,
- -0.056467f, -0.361806f, -0.256347f, -1.419173f, -0.159093f, 0.023017f,
- 0.667915f, -0.176995f, 0.022307f, -0.169493f, 0.581377f, 0.044929f,
- 0.044914f, -0.056290f, 0.324196f, 0.648043f, -0.089381f, -0.054971f,
- 0.064782f, 0.629356f, -0.003760f, -0.123822f, 0.144133f, -0.378821f,
- 1.116858f, 0.128552f, -0.668783f, 0.207194f, -0.437781f, -0.283321f,
- -0.549404f, 0.010538f, 0.208997f, 0.231396f, -0.174347f, 0.161910f,
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
};
static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
- -0.197883f,
- -0.136696f,
- 0.094115f,
- 0.612799f,
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
};
static const NN_CONFIG av1_4_partition_nnconfig_16 = {
@@ -1519,7 +1427,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
LABEL_SIZE, // num_outputs
1, // num_hidden_layers
{
- 48, // num_hidden_nodes
+ 24, // num_hidden_nodes
},
{
av1_4_partition_nn_weights_16_layer0,
@@ -1532,143 +1440,143 @@ static const NN_CONFIG av1_4_partition_nnconfig_16 = {
};
static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
- 0.114554f, 0.043669f, 0.313291f, 0.167688f, -0.413357f, 0.088232f,
- 0.301915f, -0.358117f, 0.267711f, -0.252716f, -0.038531f, -0.032805f,
- -0.025382f, 0.023624f, -0.949694f, -0.065480f, -0.375721f, -0.697319f,
- -0.117387f, -0.204309f, -0.190797f, -0.223867f, -0.190248f, 0.026668f,
- 0.199717f, 0.216902f, -0.239241f, -0.096894f, -0.225046f, 0.246523f,
- 0.002333f, -0.254385f, -0.205815f, 0.123139f, -0.476923f, 0.137557f,
- 0.059686f, -0.124013f, 0.974675f, 0.889753f, 0.378940f, 0.526413f,
- -0.208747f, -0.001913f, 0.094081f, 0.848010f, 0.062042f, 0.159831f,
- 0.071016f, 0.024437f, 0.212611f, 0.039501f, -0.149922f, -0.055229f,
- -0.229270f, 0.129004f, -0.182803f, 0.291223f, -1.197804f, -0.916991f,
- -0.024095f, 0.738729f, -0.300326f, 0.402480f, 0.023944f, -0.022613f,
- -0.004554f, 0.001784f, 0.035143f, -0.202237f, 0.080252f, -0.003912f,
- -0.040345f, -0.121881f, 0.126672f, 0.093507f, -0.081305f, -0.081099f,
- -0.218824f, -0.459254f, -0.055250f, -0.095096f, 0.207278f, 0.245259f,
- -0.380849f, -0.334458f, -0.351449f, -0.513045f, -0.407823f, -0.222423f,
- 0.103205f, -0.299965f, -0.211472f, -0.348690f, -0.283688f, -0.152743f,
- -0.204005f, -0.173636f, 0.020302f, -0.109112f, 0.081203f, -0.137344f,
- -0.364582f, -0.343133f, -0.176167f, -0.446541f, 0.144844f, -0.268105f,
- -0.003889f, -0.309560f, -0.236092f, -0.299450f, 0.248269f, 0.207510f,
- -0.279023f, -0.272472f, -0.166427f, 0.205973f, -0.345692f, -0.238400f,
- -0.319178f, -0.327246f, -0.321756f, 0.043191f, -0.027520f, -0.029310f,
- 0.161379f, 0.031154f, -0.605365f, -0.230926f, 0.261142f, -0.262678f,
- -0.373351f, -0.326245f, 0.279222f, 0.684357f, -0.864302f, 0.036132f,
- 0.239307f, 0.136262f, 0.124002f, -0.410379f, -0.172722f, -0.376670f,
- -0.195889f, 0.037292f, -0.055295f, 1.022308f, 0.237600f, -0.618435f,
- 0.366154f, 0.168308f, -0.473467f, -0.756558f, -0.044830f, 0.019057f,
- -0.084214f, -0.007789f, -0.066028f, -0.074562f, 0.002082f, 0.001007f,
- -0.269676f, -0.164768f, -0.027271f, -0.098935f, 0.009431f, 0.254431f,
- 0.124238f, -0.198181f, 0.142723f, -0.112997f, -0.164224f, -0.355160f,
- 0.135330f, -0.379557f, 0.079392f, 0.210607f, -0.354927f, -0.277678f,
- -0.931111f, 0.056208f, -0.347710f, -0.355415f, 0.826145f, 0.390625f,
- 0.374414f, -0.205685f, 0.562485f, 0.152288f, 0.130635f, 0.056622f,
- 0.057972f, 0.095526f, -0.082436f, -0.085938f, -0.070570f, -0.087634f,
- 0.335934f, 0.084860f, 0.544424f, -0.278917f, 0.476740f, 0.050927f,
- -1.288817f, -0.078320f, -0.553041f, -0.160538f, -0.109365f, -0.127146f,
- -0.032524f, -0.105117f, -0.182965f, -0.024723f, 0.083317f, 0.060073f,
- -0.042945f, 0.015249f, 1.241504f, 0.662613f, 0.530496f, -0.180519f,
- -1.099086f, -0.825844f, 0.551856f, -0.025009f, -0.006619f, -0.001049f,
- 0.014828f, -0.035166f, -0.241091f, -0.136364f, -0.003219f, -0.014581f,
- -0.379945f, -0.226191f, -0.161241f, -0.496390f, -0.147175f, -0.118004f,
- -0.128206f, -0.389770f, -0.184288f, -0.119076f, -0.379211f, 0.236180f,
- -0.468730f, -0.175170f, 0.136433f, 0.167739f, -0.377602f, 0.135772f,
- 0.040972f, -0.193974f, -0.319475f, -0.016469f, -0.412027f, -0.322605f,
- 0.111125f, -0.078456f, -0.387234f, -0.401605f, -0.088717f, -0.340682f,
- 0.010556f, 0.058256f, -0.127352f, 0.017665f, 0.072632f, -0.171966f,
- -0.117342f, -0.166050f, -0.182689f, -0.073182f, 0.096279f, -0.260229f,
- 0.025216f, -0.332236f, -0.218706f, -0.200153f, -0.110303f, 0.073499f,
- -0.280123f, 0.132262f, -0.308330f, -0.119036f, -0.303874f, -0.065445f,
- -0.412137f, 0.057167f, 0.044582f, -0.330952f, -0.232572f, 0.039732f,
- -0.326877f, -0.300569f, -0.467164f, -0.371499f, 0.034430f, 0.058277f,
- -0.042485f, -0.409028f, -0.110889f, -0.500758f, -0.343141f, 0.042023f,
- -1.071050f, 0.086854f, -0.004932f, -0.259698f, 0.125301f, -0.742663f,
- -0.370517f, -0.772840f, 0.193628f, 0.554676f, 0.051283f, -0.196639f,
- 0.040344f, 0.027391f, -0.040501f, 0.038303f, 0.032972f, -0.014638f,
- 0.097720f, -0.206897f, -0.015480f, 0.008543f, 0.034469f, 0.127234f,
- -0.396463f, -0.390189f, 0.117538f, -0.435622f, 0.043420f, -0.241987f,
- -0.118254f, -0.190349f, 0.190273f, -0.085625f, -0.141253f, -0.377438f,
- -0.249211f, 0.214512f, -0.363191f, -0.754851f, 0.238045f, 1.127635f,
- 0.173947f, -0.357620f, 0.073671f, 0.220617f, 0.072067f, -0.076214f,
- -0.044583f, -0.018371f, 0.010952f, -0.135116f, 0.076597f, 0.034480f,
- -0.070212f, -0.454429f, -0.135215f, 0.163851f, -0.625990f, -0.283991f,
- 0.284051f, 0.182935f, -0.048717f, 0.002484f, -0.009086f, 0.321724f,
- 0.125162f, -0.069624f, -0.430299f, -0.007224f, -0.284725f, -0.475662f,
- 0.123807f, -0.313614f, -0.103142f, 0.072125f, 0.100320f, -0.185558f,
- -0.481522f, -0.247311f, -0.386762f, -0.258850f, 0.178844f, -0.381231f,
- -0.436001f, -0.374834f, 0.230104f, -0.500679f, 0.170880f, 0.029657f,
- -0.105857f, -0.366671f, -0.268833f, 0.036885f, -0.026776f, 0.037837f,
- -0.362095f, -0.254933f, 0.129650f, 0.007945f, -0.304715f, -0.100813f,
- -0.342849f, -0.269223f, 0.178490f, 0.186735f, -0.353995f, 0.050381f,
- -0.440186f, 0.025985f, 1.096969f, 1.132937f, 0.581545f, 0.271734f,
- -0.109169f, -0.014239f, 0.688644f, 0.602702f, 0.048616f, 0.022335f,
- 0.037545f, 0.081667f, -0.109038f, -0.088565f, -0.002506f, -0.041420f,
- -0.132515f, 0.187312f, 0.677273f, 1.111182f, 0.199096f, -0.211551f,
- -0.896508f, 0.257981f, 0.007803f, 0.160343f, -0.124864f, -0.097150f,
- 0.225090f, 0.242900f, -0.195665f, 0.011310f, 0.160765f, 0.169195f,
- -0.081994f, -0.017372f, -0.566190f, -0.902086f, 0.027768f, 0.511419f,
- 0.076009f, -0.165861f, 0.240487f, 0.006298f, -0.153334f, 0.041249f,
- 0.387092f, 0.313011f, -0.032269f, 0.019024f, 0.052568f, 0.124247f,
- 0.197640f, 0.002537f, 0.651044f, 0.829828f, -0.446444f, -0.402042f,
- -0.469399f, -0.019842f, 0.371960f, 0.140373f, -0.044808f, 0.008283f,
- 0.093791f, 0.052149f, 0.143123f, -0.449571f, -0.868816f, -0.265661f,
- -0.225232f, -0.014704f, 0.543836f, -0.374498f, 0.561647f, 1.309445f,
- 0.056789f, -0.048447f, 0.255758f, 0.644553f, -0.124802f, 0.097419f,
- -0.149336f, 0.021596f, -0.043699f, 0.057591f, -0.000077f, 0.034488f,
- -0.049353f, -0.007799f, 0.437914f, 0.509369f, 0.674428f, 1.858949f,
- -0.205964f, 0.060776f, 0.184213f, 0.037177f, -0.062535f, -0.115408f,
- 0.076498f, 0.010235f, -0.142253f, 0.009983f, 0.073436f, 0.038716f,
- -0.369983f, -0.185959f, -0.137867f, 0.032134f, 0.213814f, -0.125571f,
- 0.247874f, -0.166871f, -0.160890f, 0.147029f, 0.267143f, -0.298488f,
- -0.210203f, -0.188313f, -0.085024f, -0.244962f, -0.189833f, -0.261242f,
- 0.399519f, 0.143200f, -0.776419f, -0.374639f, -0.022066f, 0.582904f,
- 0.006430f, -0.139134f, -0.491894f, -0.430579f, -0.358221f, -0.231365f,
- -0.398255f, -0.173231f, 0.211789f, -0.036121f, -0.266856f, 0.042956f,
- -1.138513f, -0.070313f, 0.158803f, 0.406989f, -0.015974f, 0.651020f,
- -0.468982f, -0.310019f, 0.416922f, 0.895162f, 0.019921f, 0.004023f,
- 0.006962f, 0.000863f, -0.216395f, -0.074913f, -0.002613f, 0.026703f,
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
};
static const float av1_4_partition_nn_bias_32_layer0[32] = {
- 0.133615f, -0.113389f, -0.575989f, 0.589389f, -0.193574f, -0.132463f,
- 0.000000f, 0.060317f, 0.264577f, -0.060599f, 0.540147f, -0.127782f,
- -0.548802f, -0.172235f, -0.193032f, -0.026301f, -0.177527f, 0.267821f,
- -0.115455f, -0.137162f, -0.079595f, -0.041443f, -0.043856f, -0.657220f,
- -0.448931f, 0.446300f, 0.250002f, 0.223559f, -0.647723f, -0.014369f,
- 0.084333f, -0.056270f,
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
};
static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
- -0.069633f, -0.087239f, 0.365816f, -0.068579f, 0.231198f, -0.067856f,
- -0.139892f, -0.100235f, -0.488166f, -0.150112f, -0.005546f, 0.210832f,
- 0.778888f, 0.169624f, 0.089968f, -0.243569f, 0.353483f, 0.032296f,
- -0.157408f, 0.286885f, -0.063537f, -0.324055f, -0.161464f, 0.430600f,
- 0.277707f, -0.196463f, 0.154647f, 0.059804f, 0.176408f, 0.303179f,
- -0.040156f, 0.375810f, -0.363032f, -0.186808f, -0.264561f, -0.158937f,
- -0.007949f, -0.076394f, 0.056475f, 0.308528f, 0.695387f, 0.051336f,
- 0.433063f, -0.229948f, -1.210712f, 0.036286f, 0.183868f, -0.117660f,
- 0.230134f, -0.093469f, 0.237918f, 0.625986f, -0.236671f, -0.377172f,
- 0.331091f, -0.394004f, -0.214349f, 0.243940f, -0.600348f, 0.069843f,
- 0.088325f, 0.225775f, 0.276884f, -0.604493f, 0.769812f, 0.259574f,
- 0.086220f, 0.511515f, -0.282584f, -0.157719f, 0.278778f, -0.332732f,
- 0.068985f, -0.237236f, -0.006102f, -0.154883f, 0.710288f, -0.245896f,
- -0.255895f, -0.398038f, 0.304084f, -0.317065f, 0.192609f, -0.235613f,
- 0.461340f, 0.117194f, 0.116817f, 0.196150f, 0.421622f, -0.264495f,
- 0.617852f, -0.351756f, -0.310016f, 0.135932f, -0.242622f, -0.073094f,
- 0.042077f, 0.039230f, -0.482715f, 0.553187f, 0.360637f, 0.313484f,
- -0.131540f, -0.104731f, 0.374704f, 0.222173f, 0.437657f, 0.029827f,
- -0.545156f, -0.203176f, 0.267824f, 0.169237f, -0.057871f, 0.552197f,
- 0.272243f, 0.025681f, -0.262192f, 0.255934f, -0.202407f, -0.483317f,
- -0.204721f, 0.288807f, -0.030735f, -0.047161f, -0.780724f, 0.381939f,
- -0.295318f, 0.537378f,
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
};
static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
- -0.332518f,
- 0.114452f,
- 0.098949f,
- 0.465896f,
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
};
static const NN_CONFIG av1_4_partition_nnconfig_32 = {
@@ -1688,82 +1596,112 @@ static const NN_CONFIG av1_4_partition_nnconfig_32 = {
},
};
-static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
- 0.256343f, -0.021774f, -0.117102f, 0.416930f, 0.188160f, 0.148768f,
- -0.611181f, -0.121607f, -0.394825f, -0.875025f, -0.167071f, 0.016408f,
- 0.222769f, -0.199332f, 0.058667f, -0.679529f, 0.081744f, 0.044438f,
- -0.182941f, -0.110339f, -0.137822f, -0.096164f, -0.132319f, 0.140036f,
- -0.049503f, -0.309894f, -0.323991f, 0.166113f, 0.138104f, -0.263629f,
- -0.368460f, -0.273989f, 0.147239f, 0.044566f, -0.363357f, -0.030792f,
- 0.020734f, 0.068506f, -0.434214f, 0.581644f, -1.244146f, -0.569162f,
- 0.179499f, -0.188900f, 0.078431f, -0.392126f, -0.006431f, 0.112146f,
- -0.065892f, -0.051319f, 0.094607f, 0.251700f, -0.000650f, 0.011911f,
- 0.080449f, 0.022816f, 0.322382f, 0.577070f, 0.927738f, 0.178707f,
- -0.101237f, -0.212521f, 0.560261f, -0.206492f, -0.077591f, -0.069960f,
- 0.025727f, 0.041122f, -0.735228f, -0.506091f, -0.600776f, -0.117829f,
- 0.103556f, 0.141823f, 0.853448f, 0.339488f, 0.994022f, 0.121693f,
- -2.065366f, -0.352510f, -0.174323f, -0.323400f, -0.002193f, 0.004161f,
- 0.042469f, -0.005319f, -0.305784f, -0.371353f, 0.011194f, -0.018597f,
- 0.209260f, 0.071577f, 0.242470f, -0.856593f, 0.288842f, 1.062608f,
- -0.300472f, 0.221623f, -0.813563f, -0.250347f, -0.081455f, -0.092779f,
- -0.168132f, -0.180640f, -0.075130f, -0.052906f, -0.015645f, 0.127158f,
- -0.006546f, 0.051671f, 0.545608f, 1.101804f, 0.288086f, 1.107046f,
- -0.200012f, 0.220182f, -0.189220f, -0.554973f, 0.040711f, -0.058029f,
- 0.043737f, 0.016164f, -0.391790f, -0.287770f, -0.046545f, 0.045071f,
- 0.190005f, -0.076963f, 0.836839f, 1.633266f, 0.902928f, 0.991972f,
- -0.127932f, 0.293680f, -0.035984f, 0.476179f, -0.098024f, 0.068314f,
- -0.058365f, 0.096221f, -0.000321f, -0.128840f, 0.136441f, -0.061853f,
- 0.270367f, -0.184129f, -0.373670f, -0.177381f, 0.262109f, -0.378013f,
- -0.053249f, -0.456389f, 0.222972f, -0.228067f, -0.115210f, -0.277797f,
- 0.096913f, -0.014512f, -0.015533f, 0.026389f, -0.360536f, -0.078477f,
- -0.203186f, 0.199574f, 0.770476f, 0.595592f, 0.360828f, 0.547721f,
- -0.804787f, 0.389690f, -0.437645f, 0.576776f, 0.081903f, 0.082750f,
- 0.007166f, -0.143755f, 0.114462f, 0.472432f, -0.058974f, 0.077761f,
- -2.015181f, -0.054942f, -0.110894f, 0.529188f, -0.003300f, 0.913895f,
- -0.324643f, 0.316135f, -0.291729f, 1.072647f, -0.029236f, 0.045592f,
- -0.039399f, 0.043472f, -0.303244f, -0.108761f, -0.011154f, 0.009693f,
- -0.374985f, 0.027758f, 0.302075f, -0.295758f, -0.165563f, -0.297259f,
- -0.485624f, -0.469310f, -0.028247f, -0.124440f, -0.428082f, 0.096325f,
- 0.089003f, -0.301585f, 0.022474f, 0.077477f, -0.032233f, -0.231036f,
- 0.143206f, 0.169113f, -0.556486f, 0.346327f, -0.667790f, 0.126983f,
- 0.179727f, 0.397307f, -0.490612f, -1.708789f, -0.040336f, -0.028547f,
- -0.091319f, -0.119367f, -0.518796f, -0.543383f, 0.037162f, 0.031344f,
- -0.131692f, 0.119353f, 0.799313f, 0.443848f, -0.499919f, -1.002983f,
- 0.375477f, 0.221096f, -0.238033f, 0.284849f, 0.021897f, 0.023338f,
- -0.059067f, 0.117276f, 0.039540f, 0.049630f, 0.175150f, 0.014166f,
- -0.071486f, 0.091234f, -1.007432f, -1.417378f, 0.640528f, 1.442576f,
- -0.257183f, -0.597016f, 0.861785f, 0.276121f, -0.098017f, 0.120514f,
- -0.133184f, 0.106529f, 0.171644f, 0.059513f, 0.215952f, -0.009441f,
- -0.505313f, 0.063174f, 0.229148f, -0.344213f, 0.862721f, 1.549941f,
- -0.220129f, 0.493094f, 0.264095f, 0.143641f, 0.084968f, -0.078266f,
- 0.032335f, -0.019006f, -0.098205f, 0.119213f, -0.103465f, 0.072811f,
-};
-
-static const float av1_4_partition_nn_bias_64_layer0[16] = {
- 0.111611f, -0.067682f, 0.633594f, 0.143559f, -1.051284f, -0.266625f,
- -0.829789f, -0.956123f, -0.153484f, -0.787741f, 0.004832f, -0.080769f,
- 0.235166f, 0.449468f, 0.294689f, -0.395300f,
-};
-
-static const float av1_4_partition_nn_weights_64_layer1[16 * LABEL_SIZE] = {
- -0.069999f, -0.093710f, -0.423714f, -0.028138f, 0.684415f, 0.141445f,
- 0.507161f, 0.435533f, -0.263268f, 0.585105f, 0.235301f, 0.127536f,
- -0.688639f, -0.217993f, -0.540066f, 0.406718f, 0.018210f, -0.077349f,
- -0.124823f, -0.488220f, -0.957026f, 0.302632f, 0.285490f, -0.411356f,
- 0.091089f, 0.103862f, -0.549291f, 0.148628f, 0.640603f, -0.601018f,
- 0.178024f, 0.601370f, 0.313780f, 0.051938f, 0.524083f, 0.814631f,
- -0.415522f, -0.738849f, 0.477881f, -0.342864f, 0.105181f, 0.040010f,
- -0.177521f, 0.400646f, 0.167093f, 0.388279f, -0.898439f, -0.111936f,
- 0.469875f, -0.099528f, -0.217370f, 0.283742f, -0.033798f, -0.142797f,
- -0.174057f, -1.293311f, -0.038777f, -0.003846f, 0.093642f, -0.527150f,
- -0.021259f, 0.194651f, -0.276294f, -0.109514f,
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
};
static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
- -0.688947f,
- 0.121075f,
- 0.289597f,
- 0.948091f,
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
};
static const NN_CONFIG av1_4_partition_nnconfig_64 = {
@@ -1771,7 +1709,7 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
LABEL_SIZE, // num_outputs
1, // num_hidden_layers
{
- 16, // num_hidden_nodes
+ 24, // num_hidden_nodes
},
{
av1_4_partition_nn_weights_64_layer0,
@@ -1786,8 +1724,725 @@ static const NN_CONFIG av1_4_partition_nnconfig_64 = {
#undef FEATURE_SIZE
#undef LABEL_SIZE
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[3] = {
+ 2.68750f,
+ -1.31894f,
+ -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[3] = {
+ 2.47332f,
+ -1.65756f,
+ -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[3] = {
+ 0.32215f,
+ -0.57522f,
+ 0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[3] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_AB_PARTITION_MODEL_WEIGHTS_H_
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
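Every av1_*_nnconfig added in this header, the partition-breakout models and the rectangular-partition models alike, describes the same small fully connected MLP: num_inputs features, one hidden layer, and num_outputs scores, with one weight array and one bias array per layer. The sketch below shows how such a config would be evaluated, assuming the NN_CONFIG layout from av1/encoder/ml.h (node-major weight storage, ReLU on hidden nodes, linear outputs). The in-tree evaluator is av1_nn_predict(); this helper, its name, and the member names it touches are illustrative assumptions, not code from the patch.

    // Sketch of an NN_CONFIG forward pass (assumed layout; see note above).
    static void nn_forward_sketch(const float *input, const NN_CONFIG *cfg,
                                  float *output) {
      float buf[2][128];  // scratch activations; 128 >= any hidden width here
      const float *in = input;
      int num_in = cfg->num_inputs;
      for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
        const int is_final = (layer == cfg->num_hidden_layers);
        const int num_out =
            is_final ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
        const float *w = cfg->weights[layer];  // node-major: w[node][input]
        const float *b = cfg->bias[layer];
        float *out = is_final ? output : buf[layer & 1];
        for (int node = 0; node < num_out; ++node) {
          float val = b[node];
          for (int i = 0; i < num_in; ++i) val += w[node * num_in + i] * in[i];
          out[node] = is_final ? val : (val > 0.0f ? val : 0.0f);  // ReLU
        }
        in = out;
        num_in = num_out;
      }
    }

The breakout configs emit a single score per block size, while the rectangular-partition configs emit LABEL_SIZE = 3 scores; how the partition-search callers threshold those scores is outside this hunk.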
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 461c3af83..c5508e25c 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -70,7 +70,7 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
// TODO(any): enable multi-threading and remove this flag once the loop
// filter mask is compatible with multi-threaded filtering.
#if LOOP_FILTER_BITMASK
- av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane,
plane + 1, partial_frame);
#else
if (cpi->num_workers > 1)
@@ -193,6 +193,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
(void)sd;
lf->sharpness_level = 0;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
if (method == LPF_PICK_MINIMAL_LPF) {
lf->filter_level[0] = 0;
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
index 2a168358e..357097ae1 100644
--- a/third_party/aom/av1/encoder/picklpf.h
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PICKLPF_H_
-#define AV1_ENCODER_PICKLPF_H_
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
#ifdef __cplusplus
extern "C" {
@@ -27,4 +27,4 @@ void av1_pick_filter_level(const struct yv12_buffer_config *sd,
} // extern "C"
#endif
-#endif // AV1_ENCODER_PICKLPF_H_
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index 28b693b08..e7804f6b4 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -15,6 +15,7 @@
#include <math.h>
#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/binary_codes_writer.h"
@@ -22,7 +23,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
-
#include "av1/common/onyxc_int.h"
#include "av1/common/quant_common.h"
#include "av1/common/restoration.h"
@@ -181,6 +181,77 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
}
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
int src_stride, const uint8_t *dat8,
int dat_stride, int use_highbitdepth,
@@ -192,21 +263,9 @@ static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
int xq[2];
decode_xq(xqd, xq, params);
if (!use_highbitdepth) {
- const uint8_t *src = src8;
- const uint8_t *dat = dat8;
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- const int32_t u =
- (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
- int32_t v = u << SGRPROJ_PRJ_BITS;
- if (params->r[0] > 0) v += xq[0] * (flt0[i * flt0_stride + j] - u);
- if (params->r[1] > 0) v += xq[1] * (flt1[i * flt1_stride + j] - u);
- const int32_t e =
- ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
- src[i * src_stride + j];
- err += e * e;
- }
- }
+ err = av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
} else {
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
@@ -463,9 +522,11 @@ static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
// Iterate over the stripe in blocks of width pu_width
for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j);
- av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt0_row + j,
- flt1_row + j, flt_stride, sgr_params_idx,
- bit_depth, use_highbd);
+ const int ret = av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd);
+ (void)ret;
+ assert(!ret);
}
}
}
@@ -588,22 +649,9 @@ static void search_sgrproj(const RestorationTileLimits *limits,
if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
}
-static double find_average(const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int stride) {
- uint64_t sum = 0;
- double avg = 0;
- int i, j;
- aom_clear_system_state();
- for (i = v_start; i < v_end; i++)
- for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
- avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
- return avg;
-}
-
-static void compute_stats(int wiener_win, const uint8_t *dgd,
- const uint8_t *src, int h_start, int h_end,
- int v_start, int v_end, int dgd_stride,
- int src_stride, double *M, double *H) {
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
int i, j, k, l;
double Y[WIENER_WIN2];
const int wiener_win2 = wiener_win * wiener_win;
@@ -626,8 +674,7 @@ static void compute_stats(int wiener_win, const uint8_t *dgd,
assert(idx == wiener_win2);
for (k = 0; k < wiener_win2; ++k) {
M[k] += Y[k] * X;
- H[k * wiener_win2 + k] += Y[k] * Y[k];
- for (l = k + 1; l < wiener_win2; ++l) {
+ for (l = k; l < wiener_win2; ++l) {
// H is a symmetric matrix, so we only need to fill out the upper
// triangle here. We can copy it down to the lower triangle outside
// the (i, j) loops.
@@ -1073,9 +1120,9 @@ static void search_wiener(const RestorationTileLimits *limits,
limits->h_start, limits->h_end, limits->v_start,
limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
} else {
- compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start,
- limits->h_end, limits->v_start, limits->v_end,
- rsc->dgd_stride, rsc->src_stride, M, H);
+ av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
}
const MACROBLOCK *const x = rsc->x;
@@ -1266,6 +1313,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
// problem, as these elements are ignored later, but in order to quiet
// Valgrind's warnings we initialise the array below.
memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
RestSearchCtxt rsc;
const int plane_start = AOM_PLANE_Y;
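For reference, the arithmetic accumulated by av1_lowbd_pixel_proj_error_c() above, written out with dat pixel d, source pixel s, self-guided filter outputs f_0 and f_1, and projection gains xq_0 and xq_1:

    u = d \cdot 2^{\mathrm{SGRPROJ\_RST\_BITS}}
    v = u \cdot 2^{\mathrm{SGRPROJ\_PRJ\_BITS}} + xq_0 (f_0 - u) + xq_1 (f_1 - u)
    e = \mathrm{ROUND\_POWER\_OF\_TWO}(v,\ \mathrm{SGRPROJ\_RST\_BITS} + \mathrm{SGRPROJ\_PRJ\_BITS}) - s
    \mathrm{err} = \textstyle\sum_{i,j} e^2

Relative to the per-pixel loop it replaces in get_pixel_proj_error(), the r[0] > 0 and r[1] > 0 tests are hoisted into four flat branches (both filters, flt0 only, flt1 only, neither), and the call site now goes through the rtcd dispatcher, which is why config/av1_rtcd.h is newly included: SIMD variants can override the C version.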
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
index 179b89ff9..3fec0c34b 100644
--- a/third_party/aom/av1/encoder/pickrst.h
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -8,22 +8,39 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PICKRST_H_
-#define AV1_ENCODER_PICKRST_H_
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
struct yv12_buffer_config;
struct AV1_COMP;
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_PICKRST_H_
+#endif // AOM_AV1_ENCODER_PICKRST_H_
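A note on the new g_shuffle_stats_data table: the byte pattern { 0,1, 1,2, ..., 7,8 } reads like a PSHUFB control mask that expands nine consecutive pixels into the eight overlapping (p, p+1) byte pairs consumed when accumulating windowed Wiener statistics, presumably by SIMD implementations of av1_compute_stats. The snippet below is a hypothetical illustration of that use, not code from this patch:

    #include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8

    // Hypothetical: turn pixels s0..s8 into the eight overlapping byte
    // pairs (s0,s1), (s1,s2), ..., (s7,s8) with a single shuffle.
    static INLINE __m128i load_overlapping_pairs(const uint8_t *src) {
      const __m128i pixels = _mm_loadu_si128((const __m128i *)src);
      const __m128i mask =
          _mm_loadu_si128((const __m128i *)g_shuffle_stats_data);
      return _mm_shuffle_epi8(pixels, mask);
    }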
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
index 42a4c590b..40dd46768 100644
--- a/third_party/aom/av1/encoder/pustats.h
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_PUSTATS_H_
-#define AV1_ENCODER_PUSTATS_H_
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
#ifdef __cplusplus
extern "C" {
@@ -18,83 +18,78 @@ extern "C" {
#include "av1/encoder/ml.h"
-#define NUM_FEATURES 11
+#define NUM_FEATURES_PUSTATS 8
#define NUM_HIDDEN_LAYERS 2
#define HIDDEN_LAYERS_0_NODES 12
#define HIDDEN_LAYERS_1_NODES 10
#define LOGITS_NODES 1
static const float
- av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES *
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
HIDDEN_LAYERS_0_NODES] = {
- 21.5067f, 22.6709f, 0.0049f, 0.9288f, -0.0100f, 0.0060f, -0.0071f,
- -0.0085f, 0.0348f, -0.1273f, 10.1154f, 6.3405f, 7.8589f, -0.0652f,
- -4.6352f, 0.0445f, -3.2748f, 0.1025f, -0.0385f, -0.4505f, 1.1320f,
- 3.2634f, 23.2420f, -7.9056f, 0.0522f, -18.1555f, 0.0977f, 0.1155f,
- -0.0138f, 0.0267f, -0.3992f, 0.2735f, 22.8063f, 35.1043f, 3.8140f,
- -0.0295f, 0.0771f, -0.6938f, 0.0302f, -0.0266f, 0.0989f, -0.0794f,
- 0.2981f, 33.3333f, -24.1150f, 1.4986f, -0.0975f, -15.3938f, -0.0858f,
- -0.0845f, -0.0869f, -0.0858f, 0.3542f, 0.0155f, -18.2629f, 9.6688f,
- -11.9643f, -0.2904f, -5.3026f, -0.1011f, -0.1202f, 0.0127f, -0.0269f,
- 0.3434f, 0.0595f, 16.6800f, 41.4730f, 6.9269f, -0.0512f, -1.4540f,
- 0.0468f, 0.0077f, 0.0983f, 0.1265f, -0.5234f, 0.9477f, 36.6470f,
- -0.4838f, -0.2269f, -0.1143f, -0.3907f, -0.5005f, -0.0179f, -0.1057f,
- 0.1233f, -0.4412f, -0.0474f, 0.1140f, -21.6813f, -0.9077f, -0.0078f,
- -3.3306f, 0.0417f, 0.0412f, 0.0427f, 0.0418f, -0.1699f, 0.0072f,
- -22.3335f, 16.1203f, -10.1220f, -0.0019f, 0.0005f, -0.0054f, -0.0155f,
- -0.0302f, -0.0379f, 0.1276f, 0.1568f, 21.6175f, 12.2919f, 11.0327f,
- -0.2000f, -8.6691f, -0.5593f, -0.5952f, -0.4203f, -0.4857f, -1.1239f,
- 3.1404f, -13.1098f, -5.9165f, 22.2060f, -0.0312f, -3.9642f, -0.0344f,
- -0.0656f, -0.0273f, -0.0465f, 0.1412f, -6.1974f, 9.3661f,
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
};
static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
{
- -14.3065f, 2.059f, -62.9916f, -50.1209f, 57.643f, -59.3737f,
- -30.4737f, -0.1112f, 72.5427f, 55.402f, 24.9523f, 18.5834f,
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
};
static const float
av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- 0.3883f, -0.2784f, -0.2850f, 0.4894f, -2.2450f, 0.4511f, -0.1969f,
- -0.0077f, -1.4924f, 0.1138f, -2.9848f, 1.0211f, -0.1712f, -0.1952f,
- -0.4774f, 0.0761f, -0.3186f, -0.1002f, 0.8663f, 0.5026f, 1.1920f,
- 0.9337f, 0.3911f, -0.3841f, -0.0037f, 0.7295f, -0.3183f, 0.1829f,
- -1.3670f, -0.1046f, 0.6629f, 0.0619f, -0.1551f, 0.8174f, 2.1521f,
- -1.3323f, -0.0527f, -0.5772f, 0.2001f, -0.6270f, -1.0625f, 0.3342f,
- 0.6676f, 0.4605f, -2.0049f, 0.7781f, 0.0713f, -0.0824f, -0.4529f,
- 0.1757f, -0.1338f, -0.2319f, -0.2864f, 0.1248f, 0.3887f, -0.1676f,
- 1.8422f, 0.6435f, 1.2123f, -0.5667f, -0.2423f, -0.0314f, 0.2411f,
- -0.5013f, 0.0422f, 0.2559f, 0.4435f, -0.1223f, 1.5167f, 0.3939f,
- 1.0898f, 0.0795f, -0.9251f, -0.0813f, -0.5929f, -0.0741f, 4.0687f,
- -0.4368f, -0.0984f, 0.0837f, 3.6169f, 0.0662f, -0.1679f, -0.8090f,
- -0.2610f, -0.5791f, 0.0642f, -0.2979f, -0.9036f, 0.2898f, 0.3265f,
- 0.4660f, -1.6358f, -0.0347f, 0.1087f, 0.0353f, 0.5687f, -0.5242f,
- -0.4895f, 0.7693f, -1.3829f, -0.2244f, -0.2880f, 0.0575f, 2.0563f,
- -0.2322f, -1.1597f, 1.6125f, -0.0925f, 1.3540f, 0.1432f, 0.3993f,
- -0.0303f, -1.1438f, -1.7323f, -0.4329f, 2.9443f, -0.5724f, 0.0122f,
- -1.0829f,
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
};
static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
{
- -10.3717f, 37.304f, -36.7221f, -52.7572f, 44.0877f,
- 41.1631f, 36.3299f, -48.6087f, -4.5189f, 13.0611f,
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
};
static const float
av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- 0.8362f, 1.0615f, -1.5178f, -1.2959f, 1.3233f,
- 1.4909f, 1.3554f, -0.8626f, -0.618f, -0.9458f,
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
};
static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
- 30.6878f,
+ 4.5103f,
};
static const NN_CONFIG av1_pustats_rate_nnconfig = {
- NUM_FEATURES, // num_inputs
+ NUM_FEATURES_PUSTATS, // num_inputs
LOGITS_NODES, // num_outputs
NUM_HIDDEN_LAYERS, // num_hidden_layers
{ HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
@@ -111,76 +106,71 @@ static const NN_CONFIG av1_pustats_rate_nnconfig = {
};
static const float
- av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES *
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
HIDDEN_LAYERS_0_NODES] = {
- 0.7770f, 1.0881f, 0.0177f, 0.4939f, -0.2541f, -0.2672f, -0.1705f,
- -0.1940f, -0.6395f, 1.2928f, 3.6240f, 2.4445f, 1.6790f, 0.0265f,
- 0.1897f, 0.1776f, 0.0422f, 0.0197f, -0.0466f, 0.0462f, -1.0827f,
- 2.0231f, 1.8044f, 2.7022f, 0.0064f, 0.2255f, -0.0552f, -0.1010f,
- -0.0581f, -0.0781f, 0.2614f, -3.4085f, 1.7478f, 0.1155f, -0.1458f,
- -0.0031f, -0.1797f, -0.4378f, -0.0539f, 0.0607f, -0.1347f, -0.3142f,
- -0.2014f, -0.4484f, -0.2808f, 1.5913f, 0.0046f, -0.0610f, -0.6479f,
- -0.7278f, -0.5592f, -0.6695f, -0.8120f, 2.9056f, -1.1501f, 9.3618f,
- 4.2486f, 0.0011f, -0.1499f, -0.0834f, 0.1282f, 0.0409f, 0.1670f,
- -0.1398f, -0.4661f, 13.7700f, 8.2061f, -0.0685f, 0.0061f, -0.2951f,
- 0.0169f, 0.0520f, 0.0040f, 0.0374f, 0.0467f, -0.0107f, 14.2664f,
- -2.2489f, -0.2516f, -0.0061f, -0.9921f, 0.1223f, 0.1212f, 0.1199f,
- 0.1185f, -0.4867f, 0.0325f, -5.0757f, -8.7853f, 1.0450f, 0.0169f,
- 0.5462f, 0.0051f, 0.1330f, 0.0143f, 0.1429f, -0.0258f, 0.2769f,
- -12.8839f, 22.3093f, 1.2761f, 0.0037f, -1.2459f, -0.0466f, 0.0003f,
- -0.0464f, -0.0067f, 0.2361f, 0.0355f, 23.3833f, 10.9218f, 2.6811f,
- 0.0222f, -1.1055f, 0.1825f, 0.0575f, 0.0114f, -0.1259f, 0.3148f,
- -2.0047f, 11.9559f, 5.7375f, 0.8802f, 0.0042f, -0.2469f, -0.1040f,
- -1.5679f, 0.1969f, -0.0184f, 0.0157f, 0.6688f, 3.4492f,
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
};
static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
{
- 4.5051f, -4.5858f, 1.4693f, 0.f, 3.7968f, -3.6292f,
- -7.3112f, 10.9743f, 8.027f, -2.2692f, -8.748f, -1.3689f,
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
};
static const float
av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
HIDDEN_LAYERS_1_NODES] = {
- -0.0182f, -0.0925f, -0.0311f, -0.2962f, 0.1177f, -0.0027f, -0.2136f,
- -1.2094f, 0.0935f, -0.1403f, -0.1477f, -0.0752f, 0.1519f, -0.4726f,
- -0.3521f, 0.4199f, -0.0168f, -0.2927f, -0.2510f, 0.0706f, -0.2920f,
- 0.2046f, -0.0400f, -0.2114f, 0.4240f, -0.7070f, 0.4964f, 0.4471f,
- 0.3841f, -0.0918f, -0.6140f, 0.6056f, -0.1123f, 0.3944f, -0.0178f,
- -1.7702f, -0.4434f, 0.0560f, 0.1565f, -0.0793f, -0.0041f, 0.0052f,
- -0.1843f, 0.2400f, -0.0605f, 0.3196f, -0.0286f, -0.0002f, -0.0595f,
- -0.0493f, -0.2636f, -0.3994f, -0.1871f, -0.3298f, -0.0788f, -1.0685f,
- 0.1900f, -0.5549f, -0.1350f, -0.0153f, -0.1195f, -0.5874f, 1.0468f,
- 0.0212f, -0.2306f, -0.2677f, -0.3000f, -1.0702f, -0.1725f, -0.0656f,
- -0.0226f, 0.0616f, -0.3453f, 0.0810f, 0.4838f, -0.3780f, -1.4486f,
- 0.7777f, -0.0459f, -0.6568f, 0.0589f, -1.0286f, -0.6001f, 0.0826f,
- 0.4794f, -0.0586f, -0.1759f, 0.3811f, -0.1313f, 0.3829f, -0.0968f,
- -2.0445f, -0.3566f, -0.1491f, -0.0745f, -0.0202f, 0.0839f, 0.0470f,
- -0.2432f, 0.3013f, -0.0743f, -0.3479f, 0.0749f, -5.2490f, 0.0209f,
- -0.1653f, -0.0826f, -0.0535f, 0.3225f, -0.3786f, -0.0104f, 0.3091f,
- 0.3652f, 0.1757f, -0.3252f, -1.1022f, -0.0574f, -0.4473f, 0.3469f,
- -0.5539f,
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
};
static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
{
- 11.9337f, -0.3681f, -6.1324f, 12.674f, 9.0956f,
- 4.6069f, -4.4158f, -12.4848f, 10.8473f, 5.7633f,
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
};
static const float
av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
- 0.3245f, 0.2979f, -0.157f, -0.1441f, 0.1413f,
- -0.7496f, -0.1737f, -0.5322f, 0.0748f, 0.2518f,
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
};
static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
- 4.6065f,
+ 2.3371f,
};
static const NN_CONFIG av1_pustats_dist_nnconfig = {
- NUM_FEATURES, // num_inputs
+ NUM_FEATURES_PUSTATS, // num_inputs
LOGITS_NODES, // num_outputs
NUM_HIDDEN_LAYERS, // num_hidden_layers
{ HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
@@ -196,7 +186,6 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
},
};
-#undef NUM_FEATURES
#undef NUM_HIDDEN_LAYERS
#undef HIDDEN_LAYERS_0_NODES
#undef HIDDEN_LAYERS_1_NODES
@@ -206,4 +195,4 @@ static const NN_CONFIG av1_pustats_dist_nnconfig = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_PUSTATS_H_
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
index 9b2dac965..0bca39102 100644
--- a/third_party/aom/av1/encoder/random.h
+++ b/third_party/aom/av1/encoder/random.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RANDOM_H_
-#define AV1_ENCODER_RANDOM_H_
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
#ifdef __cplusplus
extern "C" {
@@ -26,4 +26,4 @@ static INLINE unsigned int lcg_rand16(unsigned int *state) {
} // extern "C"
#endif
-#endif // AV1_ENCODER_RANDOM_H_
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
index 1019055ed..c429f2ce5 100644
--- a/third_party/aom/av1/encoder/ransac.h
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RANSAC_H_
-#define AV1_ENCODER_RANSAC_H_
+#ifndef AOM_AV1_ENCODER_RANSAC_H_
+#define AOM_AV1_ENCODER_RANSAC_H_
#include <stdio.h>
#include <stdlib.h>
@@ -32,4 +32,4 @@ int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
int ransac_translation(int *matched_points, int npoints,
int *num_inliers_by_motion, double *params_by_motion,
int num_motions);
-#endif // AV1_ENCODER_RANSAC_H_
+#endif // AOM_AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
index 14d23f10f..7cd0962c5 100644
--- a/third_party/aom/av1/encoder/rate_distortion_model_params.h
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
-#define AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
#ifdef __cplusplus
extern "C" {
@@ -588,4 +588,4 @@ static const NN_CONFIG av1_rdcost_model_nnconfig = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index 3aae0144e..2597fb990 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -117,7 +117,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
for (i = 0; i < QINDEX_RANGE; i++) {
const double maxq = av1_convert_qindex_to_q(i, bit_depth);
kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
- kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
@@ -253,6 +253,9 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
interval += (interval & 0x01); // Round to even value
+#if CONFIG_FIX_GF_LENGTH
+ interval = AOMMAX(FIXED_GF_LENGTH, interval);
+#endif
return AOMMAX(interval, min_gf_interval);
}
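A quick trace of the new floor (illustrative numbers; FIXED_GF_LENGTH is 16 per the ratectrl.h hunk below, and MAX_GF_INTERVAL is assumed not to clamp first):

    /* framerate = 18.0: (int)(18.0 * 0.75) = 13; 13 + (13 & 1) = 14;
     * with CONFIG_FIX_GF_LENGTH: AOMMAX(FIXED_GF_LENGTH, 14) = 16;
     * the result is then clamped against min_gf_interval. */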
@@ -299,9 +302,9 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
- rc->rate_correction_factors[i] = 1.0;
+ rc->rate_correction_factors[i] = 0.7;
}
-
+ rc->rate_correction_factors[KF_STD] = 1.0;
rc->min_gf_interval = oxcf->min_gf_interval;
rc->max_gf_interval = oxcf->max_gf_interval;
if (rc->min_gf_interval == 0)
@@ -556,6 +559,14 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
arfgf_low_motion_minq, arfgf_high_motion_minq);
}
+#if REDUCE_LAST_ALT_BOOST
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+#endif
+
static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
const unsigned int curr_frame = cpi->common.current_video_frame;
@@ -918,7 +929,7 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
#define STATIC_MOTION_THRESH 95
static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
int height, int *bottom_index,
- int *top_index) {
+ int *top_index, int *arf_q) {
const AV1_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
@@ -959,7 +970,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
qindex = rc->last_boosted_qindex;
last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
- last_boosted_q * 0.75, bit_depth);
+ last_boosted_q * 0.5, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
}
} else {
@@ -1000,17 +1011,49 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// For constrained quality dont allow Q less than the cq level
if (oxcf->rc_mode == AOM_CQ) {
if (q < cq_level) q = cq_level;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+ (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
- active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-
- // Constrained quality use slightly lower active best.
- active_best_quality = active_best_quality * 15 / 16;
+ // Constrained quality uses slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+#if REDUCE_LAST_ALT_BOOST
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+ }
+#endif
+ *arf_q = active_best_quality;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ } else {
+ active_best_quality = rc->arf_q;
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ ++this_height;
+ }
+ }
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
} else if (oxcf->rc_mode == AOM_Q) {
if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ *arf_q = active_best_quality;
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+ } else {
+ active_best_quality = rc->arf_q;
+ }
#if USE_SYMM_MULTI_LAYER
if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1030,6 +1073,12 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
}
} else {
active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
#if USE_SYMM_MULTI_LAYER
if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
int this_height = gf_group->pyramid_level[gf_group->index];
@@ -1104,7 +1153,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
} else {
- q = rc->last_boosted_qindex;
+ q = AOMMIN(rc->last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
}
} else {
q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
@@ -1129,7 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
return q;
}
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
int *bottom_index, int *top_index) {
int q;
if (cpi->oxcf.pass == 0) {
@@ -1140,8 +1190,17 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
top_index);
} else {
+ assert(cpi->oxcf.pass == 2 && "invalid encode pass");
+
+ GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int arf_q = 0;
+
q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
- top_index);
+ top_index, &arf_q);
+
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ cpi->rc.arf_q = arf_q;
+ }
}
return q;
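The three REDUCE_LAST_ALT_BOOST sites above all apply the same interpolation. With boost = min_boost - gf_active_quality, the assignment simplifies to

    active_best = min_boost - arf_boost_factor * (min_boost - gf_active_quality)

so arf_boost_factor == 1.0 keeps the full GF boost while smaller values pull the ARF Q toward the high-motion minimum; e.g. min_boost = 120, gf_active_quality = 80, factor = 0.5 gives 100 (illustrative numbers).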
@@ -1327,13 +1386,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
update_golden_frame_stats(cpi);
if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
-
- // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
- // differently here for rc->avg_frame_bandwidth.
- if (cm->show_frame || rc->is_bwd_ref_frame) {
- rc->frames_since_key++;
- rc->frames_to_key--;
- }
// if (cm->current_video_frame == 1 && cm->show_frame)
/*
rc->this_frame_target =
@@ -1635,10 +1687,6 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
-#if FIX_GF_INTERVAL_LENGTH
- rc->max_gf_interval = FIXED_GF_LENGTH + 1;
-#endif
-
// Clamp min to max
rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index f0508da9e..198ecab97 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RATECTRL_H_
-#define AV1_ENCODER_RATECTRL_H_
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
@@ -25,13 +25,27 @@ extern "C" {
#define BPER_MB_NORMBITS 9
#define CUSTOMIZED_GF 1
-#define FIX_GF_INTERVAL_LENGTH 0
-#if FIX_GF_INTERVAL_LENGTH
+#if CONFIG_FIX_GF_LENGTH
#define FIXED_GF_LENGTH 16
+#define MAX_PYRAMID_LVL 4
+// We allow a frame to have at most two left/right descendants before changing
+// them into a subtree, i.e., we allow the following structure:
+/*                    OUT_OF_ORDER_FRAME
+                     / /              \ \
+(two left children) F F                F F (two right children) */
+// Therefore the max GF size supported by the 4-layer structure is
+// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both sides of their parent)
+#define MAX_PYRAMID_SIZE 24
#define USE_SYMM_MULTI_LAYER 1
+#define REDUCE_LAST_ALT_BOOST 1
+#define REDUCE_LAST_GF_LENGTH 1
+#define MULTI_LVL_BOOST_VBR_CQ 1
#else
#define USE_SYMM_MULTI_LAYER 0
+#define REDUCE_LAST_ALT_BOOST 0
+#define REDUCE_LAST_GF_LENGTH 0
+#define MULTI_LVL_BOOST_VBR_CQ 0
#endif
#if USE_SYMM_MULTI_LAYER
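A minimal compile-time check of the MAX_PYRAMID_SIZE sum from the comment above (a sketch following the comment's breakdown; C11 static_assert assumed):

    #include <assert.h>
    static_assert(1 /* KEY/OVERLAY */ + 1 /* lvl-1 ARF */ + 2 /* lvl-2 */ +
                      4 /* lvl-3 */ + 4 * 4 /* two leaves on each side */ ==
                  24, "MAX_PYRAMID_SIZE arithmetic");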
@@ -159,6 +173,9 @@ typedef struct {
// Auto frame-scaling variables.
int rf_level_maxq[RATE_FACTOR_LEVELS];
+ float_t arf_boost_factor;
+ // Q index used for ALT frame
+ int arf_q;
} RATE_CONTROL;
struct AV1_COMP;
@@ -228,7 +245,7 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
int *frame_over_shoot_limit);
// Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
int *bottom_index, int *top_index);
// Estimates q to achieve a target bits per frame
@@ -275,4 +292,4 @@ int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_RATECTRL_H_
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c
deleted file mode 100644
index e69de29bb..000000000
--- a/third_party/aom/av1/encoder/ratectrl_xiph.c
+++ /dev/null
diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h
deleted file mode 100644
index e69de29bb..000000000
--- a/third_party/aom/av1/encoder/ratectrl_xiph.h
+++ /dev/null
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index c4d4777bf..b87d89e50 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -648,6 +648,473 @@ void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
}
}
+static double interp_cubic(const double *p, double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
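interp_cubic() above is the Catmull-Rom cubic through four equally spaced samples, so it reproduces the two middle samples exactly (p[1] at x = 0, p[2] at x = 1), and interp_bicubic() applies it along four rows and then once across the row results. A minimal sanity sketch (not from the patch itself):

    #include <assert.h>
    #include <math.h>
    static void sanity_check_interp_cubic(void) {
      const double p[4] = { 2.0, 5.0, 7.0, 4.0 };
      assert(fabs(interp_cubic(p, 0.0) - p[1]) < 1e-12);  // passes through p[1]
      assert(fabs(interp_cubic(p, 1.0) - p[2]) < 1e-12);  // passes through p[2]
    }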
+
+static const double interp_rgrid_surf[65 * 18] = {
+ 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446,
+ 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868,
+ 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880,
+ 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510,
+ 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225,
+ 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788,
+ 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783,
+ 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772,
+ 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275,
+ 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783,
+ 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841,
+ 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553,
+ 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787,
+ 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620,
+ 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735,
+ 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803,
+ 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334,
+ 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042,
+ 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805,
+ 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548,
+ 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457,
+ 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652,
+ 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961,
+ 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872,
+ 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045,
+ 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441,
+ 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190,
+ 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155,
+ 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791,
+ 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533,
+ 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336,
+ 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676,
+ 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859,
+ 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706,
+ 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669,
+ 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020,
+ 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308,
+ 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916,
+ 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182,
+ 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200,
+ 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257,
+ 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588,
+ 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516,
+ 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371,
+ 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997,
+ 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568,
+ 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528,
+ 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722,
+ 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964,
+ 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861,
+ 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632,
+ 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610,
+ 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416,
+ 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242,
+ 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944,
+ 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953,
+ 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130,
+ 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193,
+ 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269,
+ 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202,
+ 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437,
+ 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262,
+ 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098,
+ 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168,
+ 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305,
+ 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670,
+ 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462,
+ 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513,
+ 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882,
+ 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674,
+ 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539,
+ 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945,
+ 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282,
+ 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382,
+ 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079,
+ 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488,
+ 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550,
+ 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356,
+ 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534,
+ 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069,
+ 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805,
+ 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967,
+ 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087,
+ 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573,
+ 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414,
+ 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739,
+ 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741,
+ 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017,
+ 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189,
+ 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798,
+ 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370,
+ 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759,
+ 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847,
+ 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480,
+ 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839,
+ 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578,
+ 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526,
+ 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624,
+ 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088,
+ 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577,
+ 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715,
+ 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139,
+ 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485,
+ 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402,
+ 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138,
+ 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162,
+ 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203,
+ 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325,
+ 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147,
+ 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989,
+ 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211,
+ 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463,
+ 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737,
+ 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946,
+ 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434,
+ 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854,
+ 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482,
+ 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083,
+ 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970,
+ 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111,
+ 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001,
+ 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644,
+ 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864,
+ 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288,
+ 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532,
+ 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477,
+ 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027,
+ 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256,
+ 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148,
+ 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306,
+ 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202,
+ 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658,
+ 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814,
+ 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523,
+ 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068,
+ 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792,
+ 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471,
+ 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800,
+ 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745,
+ 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078,
+ 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510,
+ 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309,
+ 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789,
+ 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909,
+ 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320,
+ 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800,
+ 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901,
+ 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257,
+ 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934,
+ 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525,
+ 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513,
+ 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092,
+ 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155,
+ 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143,
+ 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732,
+ 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504,
+ 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514,
+ 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339,
+ 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244,
+ 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557,
+ 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259,
+ 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445,
+ 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639,
+ 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093,
+ 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962,
+ 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512,
+ 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629,
+ 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900,
+ 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191,
+ 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
+ 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020,
+ 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859,
+ 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641,
+ 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020,
+ 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398,
+ 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663,
+ 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850,
+ 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648,
+ 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631,
+ 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626,
+ 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436,
+ 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546,
+ 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680,
+ 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524,
+ 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978,
+ 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349,
+ 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198,
+ 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460,
+ 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378,
+ 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056,
+};
+
+static const double interp_dgrid_surf[65 * 18] = {
+ 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
+ 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
+ 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149,
+ 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
+ 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
+ 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
+ 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
+ 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
+ 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
+ 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
+ 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
+ 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
+ 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449,
+ 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
+ 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
+ 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801,
+ 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
+ 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
+ 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
+ 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
+ 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393,
+ 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
+ 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
+ 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
+ 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
+ 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731,
+ 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
+ 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
+ 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
+ 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
+ 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550,
+ 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
+ 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
+ 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898,
+ 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
+ 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138,
+ 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
+ 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
+ 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141,
+ 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
+ 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993,
+ 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
+ 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
+ 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596,
+ 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
+ 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236,
+ 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
+ 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
+ 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293,
+ 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
+ 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798,
+ 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587,
+ 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862,
+ 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916,
+ 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
+ 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574,
+ 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027,
+ 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808,
+ 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911,
+ 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
+ 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988,
+ 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277,
+ 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442,
+ 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670,
+ 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080,
+ 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066,
+ 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182,
+ 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589,
+ 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333,
+ 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127,
+ 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090,
+ 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234,
+ 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686,
+ 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518,
+ 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724,
+ 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520,
+ 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288,
+ 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584,
+ 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363,
+ 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078,
+ 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923,
+ 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394,
+ 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302,
+ 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335,
+ 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988,
+ 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746,
+ 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545,
+ 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727,
+ 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965,
+ 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166,
+ 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420,
+ 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038,
+ 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871,
+ 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921,
+ 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323,
+ 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222,
+ 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619,
+ 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894,
+ 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731,
+ 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690,
+ 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167,
+ 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851,
+ 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310,
+ 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950,
+ 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993,
+ 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465,
+ 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429,
+ 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987,
+ 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098,
+ 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168,
+ 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452,
+ 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566,
+ 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535,
+ 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439,
+ 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361,
+ 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976,
+ 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135,
+ 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108,
+ 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039,
+ 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094,
+ 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449,
+ 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401,
+ 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422,
+ 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355,
+ 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905,
+ 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814,
+ 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762,
+ 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723,
+ 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839,
+ 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040,
+ 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690,
+ 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360,
+ 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692,
+ 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465,
+ 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683,
+ 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700,
+ 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815,
+ 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475,
+ 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690,
+ 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371,
+ 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221,
+ 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094,
+ 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475,
+ 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690,
+ 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280,
+ 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333,
+ 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458,
+ 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683,
+ 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762,
+ 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856,
+ 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232,
+ 0.001862,
+};
+
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *dist_f) {
+ const double x_start = -0.5;
+ const double x_end = 16.5;
+ const double x_step = 1;
+ const double y_start = -15.5;
+ const double y_end = 16.5;
+ const double y_step = 0.5;
+ const double epsilon = 1e-6;
+ const int stride = (int)rint((x_end - x_start) / x_step) + 1;
+ (void)y_end;
+
+ xm = AOMMAX(xm, x_start + x_step + epsilon);
+ xm = AOMMIN(xm, x_end - x_step - epsilon);
+ yl = AOMMAX(yl, y_start + y_step + epsilon);
+ yl = AOMMIN(yl, y_end - y_step - epsilon);
+
+ const double y = (yl - y_start) / y_step;
+ const double x = (xm - x_start) / x_step;
+
+ const int yi = (int)floor(y);
+ const int xi = (int)floor(x);
+ assert(xi > 0);
+ assert(yi > 0);
+
+ const double yo = y - yi;
+ const double xo = x - xi;
+ const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
+ const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
+ *rate_f = interp_bicubic(prate, stride, xo, yo);
+ *dist_f = interp_bicubic(pdist, stride, xo, yo);
+}
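The constants above give 18 grid columns in x (-0.5 .. 16.5, step 1.0) and 65 rows in y (-15.5 .. 16.5, step 0.5), matching the 65 * 18 tables, and the clamping keeps the 4x4 bicubic patch anchored at (yi - 1, xi - 1) fully inside the grid. A worked lookup with illustrative inputs:

    /* xm = 8.0, yl = 2.25:
     *   x = (8.0  - (-0.5))  / 1.0 = 8.5  -> xi = 8,  xo = 0.5
     *   y = (2.25 - (-15.5)) / 0.5 = 35.5 -> yi = 35, yo = 0.5 */
    double rate_f, dist_f;
    av1_model_rd_surffit(8.0, 2.25, &rate_f, &dist_f);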
+
+static const double interp_rgrid_curv[65] = {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876,
+ 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983,
+ 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203,
+ 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495,
+ 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472,
+ 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660,
+ 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390,
+ 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028,
+ 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+};
+
+static const double interp_dgrid_curv[65] = {
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
+ 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
+ 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051,
+ 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427,
+ 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354,
+ 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000,
+};
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
+ const double x_start = -15.5;
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ (void)x_end;
+
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;
+ const int xi = (int)floor(x);
+ const double xo = x - xi;
+
+ assert(xi > 0);
+
+ const double *prate = &interp_rgrid_curv[(xi - 1)];
+ const double *pdist = &interp_dgrid_curv[(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ *distbysse_f = interp_cubic(pdist, xo);
+}
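av1_model_rd_curvfit() is the one-dimensional counterpart, interpolating the two 65-entry tables over the same -15.5 .. 16.5 axis in steps of 0.5. For example (illustrative input):

    double rate_f, distbysse_f;
    av1_model_rd_curvfit(0.0, &rate_f, &distbysse_f);  // x = (0 + 15.5) / 0.5 = 31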
+
static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
const struct macroblockd_plane *pd,
ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
@@ -814,8 +1281,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_NEARESTG] = 0;
}
- rd->thresh_mult[THR_DC] += 1000;
-
rd->thresh_mult[THR_NEWMV] += 1000;
rd->thresh_mult[THR_NEWL2] += 1000;
rd->thresh_mult[THR_NEWL3] += 1000;
@@ -840,8 +1305,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_GLOBALG] += 2000;
rd->thresh_mult[THR_GLOBALA] += 2000;
- rd->thresh_mult[THR_PAETH] += 1000;
-
rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
@@ -956,15 +1419,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
- rd->thresh_mult[THR_H_PRED] += 2000;
- rd->thresh_mult[THR_V_PRED] += 2000;
- rd->thresh_mult[THR_D135_PRED] += 2500;
- rd->thresh_mult[THR_D203_PRED] += 2500;
- rd->thresh_mult[THR_D157_PRED] += 2500;
- rd->thresh_mult[THR_D67_PRED] += 2500;
- rd->thresh_mult[THR_D113_PRED] += 2500;
- rd->thresh_mult[THR_D45_PRED] += 2500;
-
rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
@@ -996,6 +1450,20 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+
+ rd->thresh_mult[THR_DC] += 1000;
+ rd->thresh_mult[THR_PAETH] += 1000;
+ rd->thresh_mult[THR_SMOOTH] += 2000;
+ rd->thresh_mult[THR_SMOOTH_V] += 2000;
+ rd->thresh_mult[THR_SMOOTH_H] += 2000;
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D203_PRED] += 2500;
+ rd->thresh_mult[THR_D157_PRED] += 2500;
+ rd->thresh_mult[THR_D67_PRED] += 2500;
+ rd->thresh_mult[THR_D113_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
}
void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index 692367d7a..755b61df5 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RD_H_
-#define AV1_ENCODER_RD_H_
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
#include <limits.h>
@@ -57,8 +57,6 @@ typedef enum {
THR_NEARESTA,
THR_NEARESTG,
- THR_DC,
-
THR_NEWMV,
THR_NEWL2,
THR_NEWL3,
@@ -100,12 +98,6 @@ typedef enum {
THR_COMP_NEAREST_NEARESTLG,
THR_COMP_NEAREST_NEARESTBA,
- THR_PAETH,
-
- THR_SMOOTH,
- THR_SMOOTH_V,
- THR_SMOOTH_H,
-
THR_COMP_NEAR_NEARLA,
THR_COMP_NEW_NEARESTLA,
THR_COMP_NEAREST_NEWLA,
@@ -202,15 +194,6 @@ typedef enum {
THR_COMP_NEW_NEWGA2,
THR_COMP_GLOBAL_GLOBALGA2,
- THR_H_PRED,
- THR_V_PRED,
- THR_D135_PRED,
- THR_D203_PRED,
- THR_D157_PRED,
- THR_D67_PRED,
- THR_D113_PRED,
- THR_D45_PRED,
-
THR_COMP_NEAR_NEARLL2,
THR_COMP_NEW_NEARESTLL2,
THR_COMP_NEAREST_NEWLL2,
@@ -243,7 +226,26 @@ typedef enum {
THR_COMP_NEW_NEWBA,
THR_COMP_GLOBAL_GLOBALBA,
- MAX_MODES
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,
+
+ LAST_SINGLE_REF_MODES = THR_GLOBALG,
+ MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
+ LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
+ MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
} THR_MODES;
typedef enum {
@@ -392,6 +394,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
unsigned int qstep, int *rate, int64_t *dist);
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *distbysse_f);
+
int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
const MACROBLOCKD *xd);
@@ -455,4 +461,4 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
} // extern "C"
#endif
-#endif // AV1_ENCODER_RD_H_
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index fef6d2875..c2d15534f 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -55,17 +55,97 @@
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/tokenize.h"
#include "av1/encoder/tx_prune_model_weights.h"
-#define DNN_BASED_RD_INTERP_FILTER 0
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
-// Set this macro as 1 to collect data about tx size selection.
-#define COLLECT_TX_SIZE_DATA 0
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist);
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
-#if COLLECT_TX_SIZE_DATA
-static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
-#endif
+typedef enum {
+ MODELRD_LEGACY,
+ MODELRD_CURVFIT,
+ MODELRD_SUFFIT,
+ MODELRD_DNN,
+ MODELRD_FULLRDY,
+ MODELRD_TYPES
+} ModelRdType;
+
+static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
+ model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
+};
+
+static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
+ model_rd_with_dnn, NULL
+};
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_JNT_COMPOUND 1
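A sketch of the intended dispatch through these tables (local variable names are illustrative; the argument list follows the model_rd_for_sb_type typedef above):

    int rate_sum, skip_txfm_sb;
    int64_t dist_sum, skip_sse_sb;
    model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
        cpi, bsize, x, xd, /*plane_from=*/0, /*plane_to=*/num_planes - 1,
        mi_row, mi_col, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb,
        /*plane_rate=*/NULL, /*plane_sse=*/NULL, /*plane_dist=*/NULL);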
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
@@ -103,6 +183,16 @@ typedef enum {
FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
} FAST_TX_SEARCH_MODE;
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd);
+
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode);
+
struct rdcost_block_args {
const AV1_COMP *cpi;
MACROBLOCK *x;
@@ -112,6 +202,7 @@ struct rdcost_block_args {
int64_t this_rd;
int64_t best_rd;
int exit_early;
+ int incomplete_exit;
int use_fast_coef_costing;
FAST_TX_SEARCH_MODE ftxs_mode;
};
@@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
{ NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
- { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEWMV, { LAST_FRAME, NONE_FRAME } },
{ NEWMV, { LAST2_FRAME, NONE_FRAME } },
{ NEWMV, { LAST3_FRAME, NONE_FRAME } },
@@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
{ NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
- { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
-
- { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
- { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
- { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
{ GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
- { H_PRED, { INTRA_FRAME, NONE_FRAME } },
- { V_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
- { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
-
{ NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
{ NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
{ NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
@@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
{ NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
{ NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
{ GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
};
static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
@@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
if (this_mode >= SINGLE_INTER_MODE_START &&
this_mode < SINGLE_INTER_MODE_END) {
assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
- assert(second_ref_frame == NONE_FRAME);
return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
[ref_frame];
}
@@ -479,6 +567,12 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
UV_D113_PRED, UV_D45_PRED,
};
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
typedef struct InterModeSearchState {
int64_t best_rd;
MB_MODE_INFO best_mbmode;
@@ -510,32 +604,21 @@ typedef struct InterModeSearchState {
int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
- int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES];
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
} InterModeSearchState;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-
-typedef struct InterModeRdModel {
- int ready;
- double a;
- double b;
- double dist_mean;
- int skip_count;
- int non_skip_count;
- int fp_skip_count;
- int bracket_idx;
-} InterModeRdModel;
-
-InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-
-#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
-static int inter_mode_data_idx[4];
-static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE];
-
int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
if (bsize == BLOCK_8X8) return 1;
if (bsize == BLOCK_16X16) return 2;
@@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
return -1;
}
-void av1_inter_mode_data_init() {
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
- const int block_idx = inter_mode_data_block_idx(i);
- if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
- InterModeRdModel *md = &inter_mode_rd_models[i];
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
md->ready = 0;
- md->skip_count = 0;
- md->non_skip_count = 0;
- md->fp_skip_count = 0;
- md->bracket_idx = 0;
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
}
}
-void av1_inter_mode_data_show(const AV1_COMMON *cm) {
- printf("frame_offset %d\n", cm->frame_offset);
- for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
- const int block_idx = inter_mode_data_block_idx(i);
- if (block_idx != -1) inter_mode_data_idx[block_idx] = 0;
- InterModeRdModel *md = &inter_mode_rd_models[i];
- if (md->ready) {
- printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i,
- md->non_skip_count, md->skip_count, md->fp_skip_count);
+static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ aom_clear_system_state();
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ const double est_ld = md->a * sse + md->b;
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
+ *est_dist = (int64_t)round(md->dist_mean);
}
+ return 1;
}
+ return 0;
}
-static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse,
- int curr_cost) {
- aom_clear_system_state();
- InterModeRdModel *md = &inter_mode_rd_models[bsize];
- if (md->ready) {
- const double est_ld = md->a * sse + md->b;
- const double est_residue_cost = (sse - md->dist_mean) / est_ld;
- const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost;
- const int64_t int64_dist_mean = (int64_t)round(md->dist_mean);
- const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean);
+static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
+ int64_t sse, int curr_cost) {
+ int est_residue_cost;
+ int64_t est_dist;
+ if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
+ int rate = est_residue_cost + curr_cost;
+ int64_t est_rd = RDCOST(rdmult, rate, est_dist);
return est_rd;
}
return 0;
}
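
A minimal sketch of how the estimate from get_est_rd() is typically used: when the modelled RD cost already exceeds the best RD cost found so far (compare the est_rd > ref_best_rd test in the removed #if 0 block below), the full transform search for that mode can be skipped. The rdcost() helper here uses illustrative scaling and is not libaom's exact RDCOST macro.

#include <stdint.h>

/* Illustrative stand-in for libaom's RDCOST macro (scaling is not exact). */
static int64_t rdcost(int rdmult, int rate, int64_t dist) {
  return (((int64_t)rate * rdmult) >> 9) + (dist << 4);
}

/* Gate: a nonzero model estimate worse than the best RD so far means the
 * expensive full search for this mode can be skipped. */
static int can_skip_full_search(int64_t est_rd, int64_t ref_best_rd) {
  return est_rd > 0 && est_rd > ref_best_rd;
}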
-#define DATA_BRACKETS 7
-static const int data_num_threshold[DATA_BRACKETS] = {
- 200, 400, 800, 1600, 3200, 6400, INT32_MAX
-};
-
-void av1_inter_mode_data_fit(int rdmult) {
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
aom_clear_system_state();
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
const int block_idx = inter_mode_data_block_idx(bsize);
- InterModeRdModel *md = &inter_mode_rd_models[bsize];
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
if (block_idx == -1) continue;
- int data_num = inter_mode_data_idx[block_idx];
- if (data_num < data_num_threshold[md->bracket_idx]) {
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
continue;
+ } else {
+ if (md->ready == 0) {
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
}
- double my = 0;
- double mx = 0;
- double dx = 0;
- double dxy = 0;
- double dist_mean = 0;
- const int train_num = data_num;
- for (int i = 0; i < train_num; ++i) {
- const double sse = (double)inter_mode_data_sse[block_idx][i];
- const double dist = (double)inter_mode_data_dist[block_idx][i];
- const double residue_cost = inter_mode_data_residue_cost[block_idx][i];
- const double ld = (sse - dist) / residue_cost;
- dist_mean += dist;
- my += ld;
- mx += sse;
- dx += sse * sse;
- dxy += sse * ld;
- }
- dist_mean = dist_mean / data_num;
- my = my / train_num;
- mx = mx / train_num;
- dx = sqrt(dx / train_num);
- dxy = dxy / train_num;
-
- md->dist_mean = dist_mean;
- md->a = (dxy - mx * my) / (dx * dx - mx * mx);
- md->b = my - md->a * mx;
- ++md->bracket_idx;
- md->ready = 1;
- assert(md->bracket_idx < DATA_BRACKETS);
-
(void)rdmult;
-#if 0
- int skip_count = 0;
- int fp_skip_count = 0;
- double avg_error = 0;
- const int test_num = data_num;
- for (int i = 0; i < data_num; ++i) {
- const int64_t sse = inter_mode_data_sse[block_idx][i];
- const int64_t dist = inter_mode_data_dist[block_idx][i];
- const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i];
- const int64_t all_cost = inter_mode_data_all_cost[block_idx][i];
- const int64_t est_rd =
- get_est_rd(bsize, rdmult, sse, all_cost - residue_cost);
- const int64_t real_rd = RDCOST(rdmult, all_cost, dist);
- const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i];
- if (est_rd > ref_best_rd) {
- ++skip_count;
- if (real_rd < ref_best_rd) {
- ++fp_skip_count;
- }
- }
- avg_error += abs(est_rd - real_rd) * 100. / real_rd;
- }
- avg_error /= test_num;
- printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n",
- test_num, bsize, avg_error, skip_count, fp_skip_count);
-#endif
}
}
-static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist,
- int residue_cost, int all_cost,
- int64_t ref_best_rd) {
+static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int64_t dist, int residue_cost) {
if (residue_cost == 0 || sse == dist) return;
const int block_idx = inter_mode_data_block_idx(bsize);
if (block_idx == -1) return;
- if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) {
- const int data_idx = inter_mode_data_idx[block_idx];
- inter_mode_data_sse[block_idx][data_idx] = sse;
- inter_mode_data_dist[block_idx][data_idx] = dist;
- inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost;
- inter_mode_data_all_cost[block_idx][data_idx] = all_cost;
- inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd;
- ++inter_mode_data_idx[block_idx];
+ InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+ if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+ aom_clear_system_state();
+ const double ld = (sse - dist) * 1. / residue_cost;
+ ++rd_model->num;
+ rd_model->dist_sum += dist;
+ rd_model->ld_sum += ld;
+ rd_model->sse_sum += sse;
+ rd_model->sse_sse_sum += sse * sse;
+ rd_model->sse_ld_sum += sse * ld;
+ }
+}
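
The running sums kept by inter_mode_data_push() are exactly the statistics an ordinary least-squares fit of ld = a * sse + b needs, and av1_inter_mode_data_fit() above recovers a = (E[xy] - E[x]E[y]) / (E[x^2] - E[x]^2) and b = E[y] - a * E[x] from their means (its dx = sqrt(sse_sse_mean) is squared right back). A self-contained sketch of the same accumulate-then-fit flow, with hypothetical names mirroring InterModeRdModel:

#include <stdio.h>

typedef struct {
  double num, sse_sum, ld_sum, sse_sse_sum, sse_ld_sum;
  double a, b;
} LinModel;

static void lin_push(LinModel *m, double sse, double ld) {
  m->num += 1;
  m->sse_sum += sse;
  m->ld_sum += ld;
  m->sse_sse_sum += sse * sse;
  m->sse_ld_sum += sse * ld;
}

static void lin_fit(LinModel *m) {
  const double mx = m->sse_sum / m->num;      /* E[x]   */
  const double my = m->ld_sum / m->num;       /* E[y]   */
  const double mxx = m->sse_sse_sum / m->num; /* E[x^2] */
  const double mxy = m->sse_ld_sum / m->num;  /* E[xy]  */
  m->a = (mxy - mx * my) / (mxx - mx * mx);
  m->b = my - m->a * mx;
}

int main(void) {
  LinModel m = { 0 };
  /* Synthetic samples lying exactly on ld = 0.5 * sse + 2. */
  for (int i = 1; i <= 4; ++i) lin_push(&m, 10.0 * i, 0.5 * (10.0 * i) + 2.0);
  lin_fit(&m);
  printf("a=%f b=%f\n", m.a, m.b); /* prints a=0.500000 b=2.000000 */
  return 0;
}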
+
+static void inter_modes_info_push(InterModesInfo *inter_modes_info,
+ int mode_rate, int64_t sse, int64_t est_rd,
+ const MB_MODE_INFO *mbmi) {
+ const int num = inter_modes_info->num;
+ assert(num < MAX_INTER_MODES);
+ inter_modes_info->mbmi_arr[num] = *mbmi;
+ inter_modes_info->mode_rate_arr[num] = mode_rate;
+ inter_modes_info->sse_arr[num] = sse;
+ inter_modes_info->est_rd_arr[num] = est_rd;
+ ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {
+ if (((const RdIdxPair *)a)->rd == ((const RdIdxPair *)b)->rd) {
+ return 0;
+ } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) {
+ return 1;
+ } else {
+ return -1;
+ }
+}
+
+static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
+ RdIdxPair *rd_idx_pair_arr) {
+ if (inter_modes_info->num == 0) {
+ return;
+ }
+ for (int i = 0; i < inter_modes_info->num; ++i) {
+ rd_idx_pair_arr[i].idx = i;
+ rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
}
+ qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+ compare_rd_idx_pair);
}
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
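
Note that compare_rd_idx_pair() returns an explicit -1/0/1 instead of subtracting: the rd fields are int64_t, and narrowing a difference to the comparator's int return type could overflow and flip the sign. A self-contained usage sketch of the sort, with a local Pair type standing in for libaom's RdIdxPair (which is defined elsewhere in the tree):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Local stand-in for libaom's RdIdxPair. */
typedef struct { int idx; int64_t rd; } Pair;

static int cmp_pair(const void *a, const void *b) {
  const int64_t ra = ((const Pair *)a)->rd, rb = ((const Pair *)b)->rd;
  /* Explicit three-way compare avoids int64 -> int truncation issues. */
  return (ra > rb) - (ra < rb);
}

int main(void) {
  Pair arr[3] = { { 0, 50 }, { 1, 10 }, { 2, 30 } };
  qsort(arr, 3, sizeof(arr[0]), cmp_pair);
  for (int i = 0; i < 3; ++i) printf("%d ", arr[i].idx); /* prints: 1 2 0 */
  printf("\n");
  return 0;
}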
@@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) {
// will lead to pruning i+1 TX types on average
static const float *prune_2D_adaptive_thresholds[] = {
// TX_4X4
- (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f,
- 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f,
- 0.08606f, 0.09827f },
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
// TX_8X8
- (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f,
- 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f,
- 0.09363f, 0.11682f },
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
// TX_16X16
(float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
0.06897f, 0.07629f, 0.08875f, 0.11169f },
@@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = {
// TX_64X64
NULL,
// TX_4X8
- (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f,
- 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
- 0.09119f, 0.10828f },
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
// TX_8X4
- (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f,
- 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f,
- 0.09167f, 0.10974f },
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
// TX_8X16
- (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f,
- 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f,
- 0.09509f, 0.12097f },
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
// TX_16X8
- (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f,
- 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f,
- 0.09485f, 0.12048f },
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
// TX_16X32
- (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f,
- 0.06506f, 0.07385f, 0.08606f, 0.10925f },
+ NULL,
// TX_32X16
- (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f,
- 0.06531f, 0.07336f, 0.08582f, 0.11072f },
+ NULL,
// TX_32X64
NULL,
// TX_64X32
NULL,
// TX_4X16
- NULL,
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
// TX_16X4
- NULL,
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
// TX_8X32
NULL,
// TX_32X8
@@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
cur_scores_2D[3];
}
score_2D_average /= 16;
- score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+
+ const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
+ int pruning_aggressiveness = 1;
+ if (tx_set_type == EXT_TX_SET_ALL16) {
+ score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+ } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
+ score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+ }
// Always keep the TX type with the highest score; prune all others whose
// score is below score_thresh.
@@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
}
}
- int pruning_aggressiveness = 0;
- if (prune_mode == PRUNE_2D_ACCURATE) {
- if (tx_set_type == EXT_TX_SET_ALL16)
- pruning_aggressiveness = 6;
- else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
- pruning_aggressiveness = 4;
- } else if (prune_mode == PRUNE_2D_FAST) {
- if (tx_set_type == EXT_TX_SET_ALL16)
- pruning_aggressiveness = 10;
- else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
- pruning_aggressiveness = 7;
- }
const float score_thresh =
prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
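
A hedged sketch of the masking step the comment above describes: the best-scoring TX type is always kept, and every other candidate scoring below the threshold has its bit set in a 16-bit prune mask (set bit = pruned). The helper name and mask convention here are illustrative, not the exact rdopt.c internals.

#include <stdint.h>

/* Build a 16-bit prune mask from per-TX-type scores: bit i set means
 * TX type i is pruned.  The best-scoring type is always kept. */
static uint16_t build_prune_mask(const float *scores, int n, float thresh) {
  int best = 0;
  for (int i = 1; i < n; ++i)
    if (scores[i] > scores[best]) best = i;
  uint16_t mask = 0;
  for (int i = 0; i < n; ++i)
    if (i != best && scores[i] < thresh) mask |= (uint16_t)(1 << i);
  return mask;
}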
@@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
}
static void model_rd_from_sse(const AV1_COMP *const cpi,
- const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
- int plane, int64_t sse, int *rate,
- int64_t *dist) {
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int dequant_shift =
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
@@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
// Fast approximation of the modelling function.
if (cpi->sf.simple_model_rd_from_var) {
const int64_t square_error = sse;
- int quantizer = (pd->dequant_Q3[1] >> dequant_shift);
+ int quantizer = pd->dequant_Q3[1] >> dequant_shift;
if (quantizer < 120)
- *rate = (int)((square_error * (280 - quantizer)) >>
- (16 - AV1_PROB_COST_SHIFT));
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
else
*rate = 0;
+ assert(*rate >= 0);
*dist = (square_error * quantizer) >> 8;
} else {
- av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
pd->dequant_Q3[1] >> dequant_shift, rate,
dist);
}
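
Worked numbers for the fast branch above, assuming AV1_PROB_COST_SHIFT == 9 so the rate shift is 16 - 9 = 7 (the new AOMMIN clamp guards the int cast, since square_error * (280 - quantizer) can exceed INT_MAX for large blocks):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the fast rate/dist model, assuming AV1_PROB_COST_SHIFT == 9. */
static void simple_model(int64_t sse, int quantizer, int *rate, int64_t *dist) {
  *rate = quantizer < 120 ? (int)((sse * (280 - quantizer)) >> 7) : 0;
  *dist = (sse * quantizer) >> 8;
}

int main(void) {
  int rate;
  int64_t dist;
  simple_model(1024, 40, &rate, &dist);
  printf("rate=%d dist=%ld\n", rate, (long)dist); /* rate=1920 dist=160 */
  return 0;
}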
@@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
- int plane_to, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int *plane_rate,
- int64_t *plane_sse, int64_t *plane_dist) {
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist) {
// Note: our transform coefficients are 8 times those of an orthogonal
// transform, so the quantizer step is also scaled by 8. Divide by 8 to
// get the effective quantizer before sending it to the modeling function.
int plane;
+ (void)mi_row;
+ (void)mi_col;
const int ref = xd->mi[0]->ref_frame[0];
int64_t rate_sum = 0;
int64_t dist_sum = 0;
int64_t total_sse = 0;
- x->pred_sse[ref] = 0;
-
for (plane = plane_from; plane <= plane_to; ++plane) {
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
if (x->skip_chroma_rd && plane) continue;
- // TODO(geza): Write direct sse functions that do not compute
- // variance as well.
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+ model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
total_sse += sse;
-
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist);
-
rate_sum += rate;
dist_sum += dist;
if (plane_rate) plane_rate[plane] = rate;
if (plane_sse) plane_sse[plane] = sse;
if (plane_dist) plane_dist[plane] = dist;
+ assert(rate_sum >= 0);
}
if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ rate_sum = AOMMIN(rate_sum, INT_MAX);
*out_rate_sum = (int)rate_sum;
*out_dist_sum = dist_sum;
}
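
model_rd_for_sb() now measures SSE directly between source and prediction (aom_sse / aom_highbd_sse) instead of summing src_diff squares, then folds high-bitdepth SSE back to an 8-bit scale. A sketch of that normalization, with a stand-in for libaom's rounding macro:

#include <stdint.h>

/* Stand-in for ROUND_POWER_OF_TWO: shift right by n with round-to-nearest. */
#define ROUND_POW2(v, n) (((v) + (((int64_t)1 << (n)) >> 1)) >> (n))

/* Each sample error carries (bd - 8) extra bits at high bit depth, so the
 * squared error grows by 2 * (bd - 8) bits.  e.g. normalize_sse(4096, 10)
 * returns 256. */
static int64_t normalize_sse(int64_t sse, int bd) {
  return bd > 8 ? ROUND_POW2(sse, (bd - 8) * 2) : sse;
}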
@@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
assert(visible_cols > 0);
#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+ if (x->using_dist_8x8 && plane == 0)
return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
tx_bsize, txb_cols, txb_rows, visible_cols,
visible_rows, x->qindex);
@@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
int blk_row, int blk_col,
const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize,
- int force_sse) {
+ const BLOCK_SIZE tx_bsize) {
int visible_rows, visible_cols;
const MACROBLOCKD *xd = &x->e_mbd;
get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
#if CONFIG_DIST_8X8
int txb_height = block_size_high[tx_bsize];
int txb_width = block_size_wide[tx_bsize];
- if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
- txb_height >= 8) {
+ if (x->using_dist_8x8 && plane == 0) {
const int src_stride = x->plane[plane].src.stride;
const int src_idx = (blk_row * src_stride + blk_col)
<< tx_size_wide_log2[0];
@@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
MAX_TX_SIZE, eob,
cpi->common.reduced_tx_set_used);
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
- // Save decoded pixels for inter block in pd->pred to avoid
- // block_8x8_rd_txfm_daala_dist() need to produce them
- // by calling av1_inverse_transform_block() again.
- const int pred_stride = block_size_wide[plane_bsize];
- const int pred_idx = (blk_row * pred_stride + blk_col)
- << tx_size_wide_log2[0];
- int16_t *pred = &x->pred_luma[pred_idx];
- int i, j;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (j = 0; j < bsh; j++)
- for (i = 0; i < bsw; i++)
- pred[j * pred_stride + i] =
- CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
- } else {
- for (j = 0; j < bsh; j++)
- for (i = 0; i < bsw; i++)
- pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
- }
- }
-#endif // CONFIG_DIST_8X8
return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
blk_row, blk_col, plane_bsize, tx_bsize);
}
@@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads(
}
}
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
#if CONFIG_COLLECT_RD_STATS
- // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
- // 0: Do not collect any RD stats
- // 1: Collect RD stats for transform units
- // 2: Collect RD stats for partition units
#if CONFIG_COLLECT_RD_STATS == 1
static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
// Generate a small sample to restrict output size.
static unsigned int seed = 21743;
- if (lcg_rand16(&seed) % 100 > 0) return;
+ if (lcg_rand16(&seed) % 256 > 0) return;
const char output_file[] = "tu_stats.txt";
FILE *fout = fopen(output_file, "a");
@@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist);
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
const double model_rate_norm = (double)model_rate / num_samples;
const double model_dist_norm = (double)model_dist / num_samples;
fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
@@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
}
#endif // CONFIG_COLLECT_RD_STATS == 1
-#if CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_RD_STATS >= 2
static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const RD_STATS *const rd_stats,
BLOCK_SIZE plane_bsize) {
@@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
// Generate a small sample to restrict output size.
static unsigned int seed = 95014;
- if (lcg_rand16(&seed) % 100 > 0) return;
+ if (lcg_rand16(&seed) % 256 > 0) return;
const char output_file[] = "pu_stats.txt";
FILE *fout = fopen(output_file, "a");
@@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const double rate_norm = (double)rd_stats->rate / num_samples;
const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
- fprintf(fout, "%g %g", rate_norm, dist_norm);
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
const int src_stride = p->src.stride;
const uint8_t *const src = p->src.buf;
@@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, " %g", sad_norm_arr[i]);
}
- fprintf(fout, " %d %d %d", q_step, bw, bh);
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist);
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
const double model_rate_norm = (double)model_rate / num_samples;
const double model_dist_norm = (double)model_dist / num_samples;
- fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
double mean = get_mean(src_diff, diff_stride, bw, bh);
mean /= (1 << shift);
@@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, "\n");
fclose(fout);
}
-#endif // CONFIG_COLLECT_RD_STATS == 2
+#endif // CONFIG_COLLECT_RD_STATS >= 2
#endif // CONFIG_COLLECT_RD_STATS
-static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE plane_bsize, int plane, int64_t *rsse,
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
int *rate, int64_t *dist) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int log_numpels = num_pels_log2_lookup[plane_bsize];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
const struct macroblock_plane *const p = &x->plane[plane];
int bw, bh;
- const int diff_stride = block_size_wide[plane_bsize];
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
&bh);
- const int num_samples = bw * bh;
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
- const int q_step = pd->dequant_Q3[1] >> dequant_shift;
-
const int src_stride = p->src.stride;
const uint8_t *const src = p->src.buf;
const int dst_stride = pd->dst.stride;
const uint8_t *const dst = pd->dst.buf;
const int16_t *const src_diff = p->src_diff;
+ const int diff_stride = block_size_wide[plane_bsize];
const int shift = (xd->bd - 8);
- int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh);
- sse = ROUND_POWER_OF_TWO(sse, shift * 2);
- const double sse_norm = (double)sse / num_samples;
if (sse == 0) {
if (rate) *rate = 0;
if (dist) *dist = 0;
- if (rsse) *rsse = sse;
return;
}
if (plane) {
int model_rate;
int64_t model_dist;
- model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate,
- &model_dist);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
if (rate) *rate = model_rate;
if (dist) *dist = model_dist;
- if (rsse) *rsse = sse;
return;
}
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+
double sse_norm_arr[4];
get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
dst_stride, src_diff, diff_stride,
@@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
mean /= (1 << shift);
}
- const double variance = sse_norm - mean * mean;
- assert(variance >= 0.0);
+ double sse_norm_sum = 0.0, sse_frac_arr[3];
+ for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
+ for (int k = 0; k < 3; ++k)
+ sse_frac_arr[k] =
+ sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
const double q_sqr = (double)(q_step * q_step);
const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
+ const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
double hor_corr, vert_corr;
get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
- float features[11];
+ float features[NUM_FEATURES_PUSTATS];
features[0] = (float)hor_corr;
features[1] = (float)log_numpels;
- features[2] = (float)q_sqr;
+ features[2] = (float)mean_sqr_by_sse_norm;
features[3] = (float)q_sqr_by_sse_norm;
- features[4] = (float)sse_norm_arr[0];
- features[5] = (float)sse_norm_arr[1];
- features[6] = (float)sse_norm_arr[2];
- features[7] = (float)sse_norm_arr[3];
- features[8] = (float)sse_norm;
- features[9] = (float)variance;
- features[10] = (float)vert_corr;
+ features[4] = (float)sse_frac_arr[0];
+ features[5] = (float)sse_frac_arr[1];
+ features[6] = (float)sse_frac_arr[2];
+ features[7] = (float)vert_corr;
float rate_f, dist_by_sse_norm_f;
av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
@@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x,
const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
// Check if skip is better
- if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) {
+ if (rate_i == 0) {
dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
rate_i = 0;
- } else if (rate_i == 0) {
dist_i = sse << 4;
}
if (rate) *rate = rate_i;
if (dist) *dist = dist_i;
- if (rsse) *rsse = sse;
return;
}
-void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
- int plane_to, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb,
- int64_t *skip_sse_sb, int *plane_rate,
- int64_t *plane_sse, int64_t *plane_dist) {
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+ // Note: our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also scaled by 8. Divide by 8 to
+ // get the effective quantizer before sending it to the modeling function.
@@ -2562,19 +2655,306 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
int64_t dist_sum = 0;
int64_t total_sse = 0;
- x->pred_sse[ref] = 0;
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a surface model for rate and distortion, using as features
+// log2(sse_norm + 1) and log2(sse_norm / qstep^2).
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xm = log(sse_norm + 1.0) / log(2.0);
+ const double yl = log(sse_norm / qstepsqr) / log(2.0);
+ double rate_f, dist_by_sse_norm_f;
+
+ av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
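
The surface fit works on two derived features; a small sketch of just the feature computation (av1_model_rd_surffit() itself lives elsewhere in the encoder and is not reproduced here). The patch's log(x) / log(2.0) is the same quantity as log2(x):

#include <math.h>

/* Features for the rate/distortion surface fit:
 *   xm = log2(sse_norm + 1)        -- magnitude of the per-sample error
 *   yl = log2(sse_norm / qstep^2)  -- error relative to the quantizer step */
static void surffit_features(double sse_norm, int qstep, double *xm,
                             double *yl) {
  const double qstepsqr = (double)qstep * qstep;
  *xm = log2(sse_norm + 1.0);
  *yl = log2(sse_norm / qstepsqr);
}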
+
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+ // Note: our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also scaled by 8. Divide by 8 to
+ // get the effective quantizer before sending it to the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a curve model for rate and distortion, using as its only feature
+// log2(sse_norm / qstep^2).
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+
+ double rate_f, dist_by_sse_norm_f;
+ av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
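
The "check if skip is better" block is shared verbatim by the dnn, surffit, and curvfit models: whenever the modelled rate/distortion pair is no cheaper than coding nothing, the result is forced to the skip outcome (rate 0, distortion sse << 4 in the encoder's 16x-scaled distortion units). A compact restatement, again with an illustrative rdcost() stand-in rather than libaom's exact RDCOST macro:

#include <stdint.h>

/* Illustrative stand-in for libaom's RDCOST macro (scaling is not exact). */
static int64_t rdcost(int rdmult, int rate, int64_t dist) {
  return (((int64_t)rate * rdmult) >> 9) + (dist << 4);
}

/* Force the skip outcome when it is at least as cheap as the model's
 * rate/distortion estimate. */
static void prefer_skip(int rdmult, int64_t sse, int *rate, int64_t *dist) {
  if (*rate == 0) {
    *dist = sse << 4; /* zero rate always implies the skip distortion */
  } else if (rdcost(rdmult, *rate, *dist) >= rdcost(rdmult, 0, sse << 4)) {
    *rate = 0;
    *dist = sse << 4;
  }
}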
+
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+ // Note: our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also scaled by 8. Divide by 8 to
+ // get the effective quantizer before sending it to the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
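
These model_rd_for_sb_* variants (plus the full-RD one below) are selected through function-pointer tables; the patch already calls model_rd_sse_fn[MODELRD_CURVFIT] and model_rd_sb_fn[MODELRD_TYPE_INTRA] elsewhere. A hedged sketch of that dispatch pattern with simplified signatures; the enum labels mirror, but may not exactly match, the MODELRD_* constants in rdopt.c:

/* Simplified dispatch table: each entry models rate/distortion for a
 * superblock with a different estimator. */
typedef void (*ModelRdSbFn)(int plane_from, int plane_to);

enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_SURFFIT, MODELRD_DNN,
       MODELRD_FULLRDY, MODELRD_TYPES };

static void sb_legacy(int f, int t) { (void)f; (void)t; }  /* model_rd_for_sb */
static void sb_curvfit(int f, int t) { (void)f; (void)t; } /* ..._with_curvfit */
static void sb_surffit(int f, int t) { (void)f; (void)t; } /* ..._with_surffit */
static void sb_dnn(int f, int t) { (void)f; (void)t; }     /* ..._with_dnn */
static void sb_fullrdy(int f, int t) { (void)f; (void)t; } /* ..._with_fullrdy */

static const ModelRdSbFn model_sb_fn[MODELRD_TYPES] = {
  sb_legacy, sb_curvfit, sb_surffit, sb_dnn, sb_fullrdy
};
/* usage: model_sb_fn[MODELRD_CURVFIT](0, 2); */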
+
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const BLOCK_SIZE plane_bsize =
get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
int64_t sse;
int rate;
int64_t dist;
if (x->skip_chroma_rd && plane) continue;
- model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+ RD_STATS rd_stats;
+ if (plane == 0) {
+ select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+ if (rd_stats.invalid_rate) {
+ rate = 0;
+ dist = sse << 4;
+ } else {
+ rate = rd_stats.rate;
+ dist = rd_stats.dist;
+ }
+ } else {
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+ }
if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
@@ -2638,8 +3018,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
- if (intra_hash_idx > 0 &&
- intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+ if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
const TX_TYPE ref_tx_type =
@@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_DIST_8X8
if (x->using_dist_8x8) use_transform_domain_distortion = 0;
#endif
-
int calc_pixel_domain_distortion_final =
cpi->sf.use_transform_domain_distortion == 1 &&
use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
@@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
int64_t block_sse =
- pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
+ pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_sse *= 16;
@@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
assert(best_rd != INT64_MAX);
best_rd_stats->skip = best_eob == 0;
- if (best_eob == 0) best_tx_type = DCT_DCT;
if (plane == 0) {
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
best_tx_type);
@@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
int64_t rd1, rd2, rd;
RD_STATS this_rd_stats;
-#if CONFIG_DIST_8X8
- // If sub8x8 tx, 8x8 or larger partition, and luma channel,
- // dist-8x8 disables early skip, because the distortion metrics for
- // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition
- // (new distortion metric) are different.
- // Exception is: dist-8x8 is enabled but still MSE is used,
- // i.e. "--tune=" encoder option is not used.
- int bw = block_size_wide[plane_bsize];
- int bh = block_size_high[plane_bsize];
- int disable_early_skip =
- x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 &&
- (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
- x->tune_metric != AOM_TUNE_PSNR;
-#endif // CONFIG_DIST_8X8
-
av1_init_rd_stats(&this_rd_stats);
- if (args->exit_early) return;
+ if (args->exit_early) {
+ args->incomplete_exit = 1;
+ return;
+ }
if (!is_inter_block(mbmi)) {
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
@@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
#endif // CONFIG_RD_DEBUG
av1_set_txb_context(x, plane, block, tx_size, a, l);
- if (plane == 0) {
- x->blk_skip[blk_row *
- (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
- blk_col] = (x->plane[plane].eobs[block] == 0);
- }
+ const int blk_idx =
+ blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
+ blk_col;
+
+ if (plane == 0)
+ set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+ else
+ set_blk_skip(x, plane, blk_idx, 0);
rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
@@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
args->this_rd += rd;
-#if CONFIG_DIST_8X8
- if (!disable_early_skip)
-#endif
- if (args->this_rd > args->best_rd) {
- args->exit_early = 1;
- return;
- }
-}
-
-#if CONFIG_DIST_8X8
-static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- struct rdcost_block_args *args) {
- MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblockd_plane *const pd = &xd->plane[0];
- const struct macroblock_plane *const p = &x->plane[0];
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
- const uint8_t *src = &p->src.buf[0];
- const uint8_t *dst = &pd->dst.buf[0];
- const int16_t *pred = &x->pred_luma[0];
- int bw = block_size_wide[bsize];
- int bh = block_size_high[bsize];
- int visible_w = bw;
- int visible_h = bh;
-
- int i, j;
- int64_t rd, rd1, rd2;
- int64_t sse = INT64_MAX, dist = INT64_MAX;
- int qindex = x->qindex;
-
- assert((bw & 0x07) == 0);
- assert((bh & 0x07) == 0);
-
- get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
- &visible_h);
-
- const int diff_stride = block_size_wide[bsize];
- const int16_t *diff = p->src_diff;
- sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w,
- visible_h, qindex);
- sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
- sse *= 16;
-
- if (!is_inter_block(mbmi)) {
- dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh,
- visible_w, visible_h, qindex);
- dist *= 16;
- } else {
- // For inter mode, the decoded pixels are provided in x->pred_luma,
- // while the predicted pixels are in dst.
- uint8_t *pred8;
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- pred8 = CONVERT_TO_BYTEPTR(pred16);
- else
- pred8 = (uint8_t *)pred16;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (j = 0; j < bh; j++)
- for (i = 0; i < bw; i++)
- CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
- } else {
- for (j = 0; j < bh; j++)
- for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
- }
-
- dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh,
- visible_w, visible_h, qindex);
- dist *= 16;
- }
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) {
- assert(args->rd_stats.sse == sse);
- assert(args->rd_stats.dist == dist);
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
}
-#endif // DEBUG_DIST_8X8
-
- args->rd_stats.sse = sse;
- args->rd_stats.dist = dist;
-
- rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
- rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
- rd = AOMMIN(rd1, rd2);
-
- args->rd_stats.rdcost = rd;
- args->this_rd = rd;
-
- if (args->this_rd > args->best_rd) args->exit_early = 1;
}
-#endif // CONFIG_DIST_8X8
static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
@@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
&args);
-#if CONFIG_DIST_8X8
- int bw = block_size_wide[bsize];
- int bh = block_size_high[bsize];
- if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 &&
- bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
- dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
-#endif
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
- if (args.exit_early) {
+ if (invalid_rd) {
av1_invalid_rd_stats(rd_stats);
} else {
*rd_stats = args.rd_stats;
@@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
+ }
+#endif
RD_STATS this_rd_stats;
if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
@@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
}
if (n == TX_4X4) break;
}
- mbmi->tx_size = best_tx_size;
- memcpy(mbmi->txk_type, best_txk_type,
- sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
- memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+ }
// Reset the pruning flags.
av1_zero(x->tx_search_prune);
@@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode,
// Model-based RD estimation for luma intra blocks.
static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE bsize, int mode_cost) {
+ BLOCK_SIZE bsize, int mode_cost, int mi_row,
+ int mi_col) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
// RD estimation.
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
- &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL,
- NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_INTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate,
+ &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
mode_cost +=
x->angle_delta_cost[mbmi->mode - V_PRED]
@@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
// Given the base colors as specified in centroids[], calculate the RD cost
// of palette mode.
-static void palette_rd_y(
- const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
- uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
- uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
- int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion,
- int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) {
+static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, const int *data,
+ int *centroids, int n, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd,
+ int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+ int *rate_overhead, int64_t *distortion,
+ int *skippable, PICK_MODE_CONTEXT *ctx,
+ uint8_t *blk_skip) {
optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
int k = av1_remove_duplicates(centroids, n);
if (k < PALETTE_MIN_SIZE) {
@@ -3551,7 +3837,8 @@ static void palette_rd_y(
extend_palette_color_map(color_map, cols, rows, block_width, block_height);
const int palette_mode_cost =
intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
- int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
+ int64_t this_model_rd =
+ intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
return;
@@ -3580,11 +3867,11 @@ static void palette_rd_y(
}
static int rd_pick_palette_intra_sby(
- const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
- int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx,
- uint8_t *best_blk_skip) {
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+ PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
int rate_overhead = 0;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby(
// where the dominant colors and the k-means results are similar.
for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
- palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
- color_cache, n_cache, best_mbmi, best_palette_color_map,
- best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
- distortion, skippable, ctx, best_blk_skip);
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
}
// K-means clustering.
@@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby(
}
av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
}
- palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
- color_cache, n_cache, best_mbmi, best_palette_color_map,
- best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead,
- distortion, skippable, ctx, best_blk_skip);
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
}
}
@@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby(
// Return 1 if a filter intra mode is selected; return 0 otherwise.
static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize, int mode_cost,
- int64_t *best_rd, int64_t *best_model_rd,
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int mode_cost, int64_t *best_rd,
+ int64_t *best_model_rd,
PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
@@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
int64_t this_rd, this_model_rd;
RD_STATS tokenonly_rd_stats;
mbmi->filter_intra_mode_info.filter_intra_mode = mode;
- this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
continue;
@@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
// Run the RD calculation with the given luma intra prediction angle and return
// the RD cost. Update the best mode info if the RD cost is the best so far.
static int64_t calc_rd_given_intra_angle(
- const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
- int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
- RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
- int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type,
- uint8_t *best_blk_skip) {
- int this_rate;
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
+ int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
+ TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
+ TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
RD_STATS tokenonly_rd_stats;
int64_t this_rd, this_model_rd;
MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
const int n4 = bsize_to_num_blk(bsize);
assert(!is_inter_block(mbmi));
-
mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
- this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
if (*best_model_rd != INT64_MAX &&
this_model_rd > *best_model_rd + (*best_model_rd >> 1))
return INT64_MAX;
@@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle(
super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
- this_rate =
- tokenonly_rd_stats.rate + mode_cost +
- x->angle_delta_cost[mbmi->mode - V_PRED]
- [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]];
+ int this_rate =
+ mode_cost + tokenonly_rd_stats.rate +
+ x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
if (this_rd < *best_rd) {
@@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle(
// With a given luma directional intra prediction mode, pick the best angle
// delta. Return the RD cost corresponding to the best angle delta.
static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, RD_STATS *rd_stats,
- BLOCK_SIZE bsize, int mode_cost,
- int64_t best_rd,
+ int mi_row, int mi_col, int *rate,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int mode_cost, int64_t best_rd,
int64_t *best_model_rd) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
assert(!is_inter_block(mbmi));
- int i, angle_delta, best_angle_delta = 0;
- int first_try = 1;
- int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ int best_angle_delta = 0;
+ int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
TX_SIZE best_tx_size = mbmi->tx_size;
- const int n4 = bsize_to_num_blk(bsize);
TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
- for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+ for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
- for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- for (i = 0; i < 2; ++i) {
- best_rd_in = (best_rd == INT64_MAX)
- ? INT64_MAX
- : (best_rd + (best_rd >> (first_try ? 3 : 5)));
- this_rd = calc_rd_given_intra_angle(
- cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
- MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
- &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+ int first_try = 1;
+ for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
+ const int64_t best_rd_in =
+ (best_rd == INT64_MAX) ? INT64_MAX
+ : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+ const int64_t this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
+ (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
+ best_txk_type, best_blk_skip);
rd_cost[2 * angle_delta + i] = this_rd;
if (first_try && this_rd == INT64_MAX) return best_rd;
first_try = 0;
@@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
}
assert(best_rd != INT64_MAX);
- for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- int64_t rd_thresh;
- for (i = 0; i < 2; ++i) {
+ for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
int skip_search = 0;
- rd_thresh = best_rd + (best_rd >> 5);
+ const int64_t rd_thresh = best_rd + (best_rd >> 5);
if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
skip_search = 1;
if (!skip_search) {
- calc_rd_given_intra_angle(
- cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
- MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
- &best_rd, best_model_rd, best_txk_type, best_blk_skip);
+ calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
+ best_rd, (1 - 2 * i) * angle_delta,
+ MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd,
+ best_model_rd, best_txk_type, best_blk_skip);
}
}
}
- mbmi->tx_size = best_tx_size;
- mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
- memcpy(mbmi->txk_type, best_txk_type,
- sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
- memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ }
return best_rd;
}
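
Viewed on its own, the two-stage scan above is: even angle deltas first (both signs, with a slackened RD bound), then odd deltas only where an even neighbour was promising. Below is a minimal standalone sketch of that control flow; eval_delta() is a hypothetical stand-in for calc_rd_given_intra_angle(), and all encoder state is reduced to the running best RD.

#include <limits.h>
#include <stdint.h>

#define MAX_ANGLE_DELTA 3 /* as in AV1 */

/* Hypothetical stand-in for calc_rd_given_intra_angle(): returns the RD
 * cost of coding the block with the given angle delta, or INT64_MAX when
 * the trial exceeds rd_in. */
static int64_t eval_delta(int delta, int64_t rd_in) {
  const int64_t rd = 1000 + 37 * (delta < 0 ? -delta : delta);
  return rd > rd_in ? INT64_MAX : rd;
}

static int64_t scan_angle_deltas(int64_t best_rd) {
  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
  /* Stage 1: even deltas, both signs, with a slackened bound (1/8 extra on
   * the first trial, 1/32 afterwards). */
  int first_try = 1;
  for (int d = 0; d <= MAX_ANGLE_DELTA; d += 2) {
    for (int i = 0; i < 2; ++i) {
      const int64_t rd_in = (best_rd == INT64_MAX)
                                ? INT64_MAX
                                : best_rd + (best_rd >> (first_try ? 3 : 5));
      const int64_t this_rd = eval_delta((1 - 2 * i) * d, rd_in);
      rd_cost[2 * d + i] = this_rd;
      if (first_try && this_rd == INT64_MAX) return best_rd;
      first_try = 0;
      if (this_rd < best_rd) best_rd = this_rd;
    }
  }
  /* Stage 2: odd deltas, tried only when at least one even neighbour came
   * within best_rd + best_rd/32. */
  for (int d = 1; d <= MAX_ANGLE_DELTA; d += 2) {
    for (int i = 0; i < 2; ++i) {
      const int64_t rd_thresh = best_rd + (best_rd >> 5);
      if (rd_cost[2 * (d + 1) + i] > rd_thresh &&
          rd_cost[2 * (d - 1) + i] > rd_thresh)
        continue;
      const int64_t this_rd = eval_delta((1 - 2 * i) * d, best_rd);
      if (this_rd < best_rd) best_rd = this_rd;
    }
  }
  return best_rd;
}
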
@@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
// This function is used only for intra_only frames
static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize, int64_t best_rd,
- PICK_MODE_CONTEXT *ctx) {
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
assert(!is_inter_block(mbmi));
@@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO best_mbmi = *mbmi;
/* Y Search for intra prediction mode */
- for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) {
+ for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
RD_STATS this_rd_stats;
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd, this_model_rd;
mbmi->mode = intra_rd_search_mode_order[mode_idx];
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
- this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+ this_model_rd =
+ intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
if (best_model_rd != INT64_MAX &&
this_model_rd > best_model_rd + (best_model_rd >> 1))
continue;
@@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
if (is_directional_mode && av1_use_angle_delta(bsize)) {
this_rd_stats.rate = INT_MAX;
- rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
- bmode_costs[mbmi->mode], best_rd, &best_model_rd);
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
+ &this_rd_stats, bsize, bmode_costs[mbmi->mode],
+ best_rd, &best_model_rd);
} else {
super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
}
@@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
if (try_palette) {
- rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi,
- best_palette_color_map, &best_rd, &best_model_rd,
- rate, rate_tokenonly, distortion, skippable, ctx,
- ctx->blk_skip);
+ rd_pick_palette_intra_sby(
+ cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
+ best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
+ distortion, skippable, ctx, ctx->blk_skip);
}
if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
- if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
- skippable, bsize, bmode_costs[DC_PRED],
- &best_rd, &best_model_rd, ctx)) {
+ if (rd_pick_filter_intra_sby(
+ cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
+ bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
best_mbmi = *mbmi;
}
}
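
Every intra_model_yrd() call site in these hunks feeds the same gate: a candidate mode is dropped before the expensive transform search when its modelled RD exceeds one and a half times the best modelled RD so far, computed as best + (best >> 1). A minimal sketch of that test, with names local to this example:

#include <limits.h>
#include <stdint.h>

/* Returns 1 when the candidate should be pruned before the full transform
 * search. The 1.5x slack comes from best + (best >> 1); e.g. with
 * best_model_rd = 1000, candidates above 1500 are skipped. */
static int prune_by_model_rd(int64_t this_model_rd, int64_t best_model_rd) {
  if (best_model_rd == INT64_MAX) return 0; /* no reference point yet */
  return this_model_rd > best_model_rd + (best_model_rd >> 1);
}
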
@@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
int blk_row, int blk_col, int plane, int block,
- int plane_bsize, const ENTROPY_CONTEXT *a,
- const ENTROPY_CONTEXT *l, RD_STATS *rd_stats,
+ int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
TXB_RD_INFO *rd_info_array) {
const struct macroblock_plane *const p = &x->plane[plane];
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
const uint16_t cur_joint_ctx =
- (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
-
+ (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
const int txk_type_idx =
av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
// Look up RD and terminate early if we've already processed exactly
@@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
RD_STATS this_rd_stats;
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+ txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
av1_merge_rd_stats(rd_stats, &this_rd_stats);
@@ -4428,8 +4716,8 @@ static void try_tx_block_no_split(
rd_stats->zero_rate = zero_blk_rate;
const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
mbmi->inter_tx_size[index] = tx_size;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta,
- ptl, rd_stats, ftxs_mode, ref_best_rd,
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
assert(rd_stats->rate < INT_MAX);
@@ -4444,12 +4732,12 @@ static void try_tx_block_no_split(
rd_stats->rate = zero_blk_rate;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
- x->blk_skip[blk_row * bw + blk_col] = 1;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
p->eobs[block] = 0;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
DCT_DCT);
} else {
- x->blk_skip[blk_row * bw + blk_col] = 0;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
rd_stats->skip = 0;
}
@@ -4482,7 +4770,6 @@ static void try_tx_block_split(
MACROBLOCKD *const xd = &x->e_mbd;
const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
- struct macroblock_plane *const p = &x->plane[0];
const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
const int bsw = tx_size_wide_unit[sub_txs];
const int bsh = tx_size_high_unit[sub_txs];
@@ -4490,10 +4777,7 @@ static void try_tx_block_split(
RD_STATS this_rd_stats;
int this_cost_valid = 1;
int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
- int sub8x8_eob[4] = { 0, 0, 0, 0 };
- struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
+
split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
assert(tx_size < TX_SIZES_ALL);
@@ -4511,123 +4795,22 @@ static void try_tx_block_split(
&this_cost_valid, ftxs_mode,
(rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && tx_size == TX_8X8) {
- sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
- }
-#endif // CONFIG_DIST_8X8
+ if (!this_cost_valid) goto LOOP_EXIT;
+
av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-#if CONFIG_DIST_8X8
- if (!x->using_dist_8x8)
-#endif
- if (no_split_rd < tmp_rd) {
- this_cost_valid = 0;
- goto LOOP_EXIT;
- }
+
+ if (no_split_rd < tmp_rd) {
+ this_cost_valid = 0;
+ goto LOOP_EXIT;
+ }
block += sub_step;
}
}
LOOP_EXIT : {}
-#if CONFIG_DIST_8X8
- if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
-
- const uint8_t *src =
- &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
- const uint8_t *dst =
- &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
- int64_t dist_8x8;
- const int qindex = x->qindex;
- const int pred_stride = block_size_wide[plane_bsize];
- const int pred_idx = (blk_row * pred_stride + blk_col)
- << tx_size_wide_log2[0];
- const int16_t *pred = &x->pred_luma[pred_idx];
- int i, j;
- int row, col;
-
- uint8_t *pred8;
- DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
- 8, 8, 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.sse == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats->sse = dist_8x8;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- pred8 = CONVERT_TO_BYTEPTR(pred8_16);
- else
- pred8 = (uint8_t *)pred8_16;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] =
- pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- CONVERT_TO_SHORTPTR(pred8)
- [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
- dst)[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- } else {
- for (row = 0; row < 2; ++row) {
- for (col = 0; col < 2; ++col) {
- int idx = row * 2 + col;
- int eob = sub8x8_eob[idx];
-
- if (eob > 0) {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
- } else {
- for (j = 0; j < 4; j++)
- for (i = 0; i < 4; i++)
- pred8[(row * 4 + j) * 8 + 4 * col + i] =
- dst[(row * 4 + j) * dst_stride + 4 * col + i];
- }
- }
- }
- }
- dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
- 8, 8, qindex) *
- 16;
-
-#ifdef DEBUG_DIST_8X8
- if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
- assert(sum_rd_stats.dist == dist_8x8);
-#endif // DEBUG_DIST_8X8
-
- split_rd_stats->dist = dist_8x8;
- tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
- }
-#endif // CONFIG_DIST_8X8
if (this_cost_valid) *split_rd = tmp_rd;
}
@@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
const int try_no_split = 1;
int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
-
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8)
+ try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
// TX no split
@@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
}
}
-#if COLLECT_TX_SIZE_DATA
- // Do not skip tx_split when collecting tx size data.
- try_split = 1;
-#endif
-
// TX split
int64_t split_rd = INT64_MAX;
RD_STATS split_rd_stats;
@@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_info_node, &split_rd_stats, &split_rd);
}
-#if COLLECT_TX_SIZE_DATA
- do {
- if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break;
-
-#if 0
- // Randomly select blocks to collect data to reduce output file size.
- const int rnd_val = rand() % 2;
- if (rnd_val) break;
-#endif
-
- const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
- const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
- const int within_border =
- mi_row >= xd->tile.mi_row_start &&
- (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
- mi_col >= xd->tile.mi_col_start &&
- (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
- if (!within_border) break;
-
- FILE *fp = fopen(av1_tx_size_data_output_file, "a");
- if (!fp) break;
-
- // Split decision, RD cost, block type(inter/intra), q-index, rdmult,
- // and block size.
- const int split_selected = sum_rd < this_rd;
- const int is_inter = 1;
- const int txb_w = tx_size_wide[tx_size];
- const int txb_h = tx_size_high[tx_size];
- fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected,
- (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex,
- x->rdmult, is_inter, txb_w, txb_h);
-
- // Residue signal.
- const int diff_stride = block_size_wide[plane_bsize];
- const int16_t *src_diff =
- &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
- for (int r = 0; r < txb_h; ++r) {
- for (int c = 0; c < txb_w; ++c) {
- fprintf(fp, "%d,", src_diff[c]);
- }
- src_diff += diff_stride;
- }
- fprintf(fp, "\n");
-
- fclose(fp);
- } while (0);
-#endif // COLLECT_TX_SIZE_DATA
-
if (no_split.rd < split_rd) {
ENTROPY_CONTEXT *pta = ta + blk_col;
ENTROPY_CONTEXT *ptl = tl + blk_row;
@@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
mbmi->tx_size = tx_size_selected;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
no_split.tx_type);
- x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
} else {
*rd_stats = split_rd_stats;
if (split_rd == INT64_MAX) *is_cost_valid = 0;
@@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
TXB_RD_INFO_NODE *rd_info_tree) {
MACROBLOCKD *const xd = &x->e_mbd;
int is_cost_valid = 1;
- int64_t this_rd = 0;
+ int64_t this_rd = 0, skip_rd = 0;
if (ref_best_rd < 0) is_cost_valid = 0;
@@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ skip_rd = RDCOST(x->rdmult, s1, 0);
+ this_rd = RDCOST(x->rdmult, s0, 0);
for (idy = 0; idy < mi_height; idy += bh) {
for (idx = 0; idx < mi_width; idx += bw) {
+ int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
plane_bsize, ctxa, ctxl, tx_above, tx_left,
- &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid,
- ftxs_mode, rd_info_tree);
+ &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
+ rd_info_tree);
if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
av1_invalid_rd_stats(rd_stats);
return;
}
av1_merge_rd_stats(rd_stats, &pn_rd_stats);
- this_rd +=
- AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
- RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
block += step;
if (rd_info_tree != NULL) rd_info_tree += 1;
}
}
+ if (skip_rd <= this_rd) {
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
+ }
}
- const int skip_ctx = av1_get_skip_context(xd);
- const int s0 = x->skip_cost[skip_ctx][0];
- const int s1 = x->skip_cost[skip_ctx][1];
- int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
- this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
- if (skip_rd <= this_rd) {
- this_rd = skip_rd;
- rd_stats->rate = 0;
- rd_stats->dist = rd_stats->sse;
- rd_stats->skip = 1;
- } else {
- rd_stats->skip = 0;
- }
- if (this_rd > ref_best_rd) is_cost_valid = 0;
-
if (!is_cost_valid) {
// reset cost value
av1_invalid_rd_stats(rd_stats);
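
The restructured loop above folds the skip/non-skip signalling cost into the per-block budget: after each transform block, the running totals are re-priced as a skip block (s1 with sse) and a coded block (s0 plus rate, with dist), and the cheaper of the two is subtracted from ref_best_rd before the next select_tx_block() call. A self-contained sketch of that bookkeeping; the real RDCOST macro combines AV1_PROB_COST_SHIFT and RDDIV_BITS, so the shifts below are placeholders for illustration only.

#include <stdint.h>

/* Placeholder with the same shape as the encoder's RDCOST macro. */
#define EX_RDCOST(rdmult, rate, dist) \
  ((((int64_t)(rdmult) * (rate)) >> 9) + ((int64_t)(dist) << 4))

static int64_t next_block_budget(int64_t ref_best_rd, int rdmult, int s0,
                                 int s1, int rate_total, int64_t dist_total,
                                 int64_t sse_total) {
  const int64_t skip_rd = EX_RDCOST(rdmult, s1, sse_total);       /* skip=1 */
  const int64_t this_rd = EX_RDCOST(rdmult, rate_total + s0, dist_total);
  const int64_t lower = skip_rd < this_rd ? skip_rd : this_rd;
  return ref_best_rd - lower; /* what the next block is allowed to spend */
}
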
@@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
.txb_skip_cost[txb_ctx.txb_skip_ctx][1];
rd_stats->zero_rate = zero_blk_rate;
rd_stats->ref_rdcost = ref_best_rd;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, ta,
- tl, rd_stats, ftxs_mode, ref_best_rd, NULL);
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
rd_stats->rate = zero_blk_rate;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
- x->blk_skip[blk_row * mi_width + blk_col] = 1;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
x->plane[0].eobs[block] = 0;
x->plane[0].txb_entropy_ctx[block] = 0;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
DCT_DCT);
} else {
rd_stats->skip = 0;
- x->blk_skip[blk_row * mi_width + blk_col] = 0;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
}
if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
rd_stats->rate += x->txfm_partition_cost[ctx][0];
@@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
const uint32_t hash) {
// Linear search through the circular buffer to find matching hash.
- int index;
- for (int i = cur_record->num - 1; i >= 0; i--) {
- index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN;
- if (cur_record->hash_vals[index] == hash) return index;
+ for (int i = cur_record->index_start - 1; i >= 0; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
}
-
+ for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
+ }
+ int index;
// If not found - add new RD info into the buffer and return its index
if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
index = (cur_record->index_start + cur_record->num) %
@@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
return index;
}
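
The rewritten lookup avoids a modulo per probe by scanning the circular buffer in two straight runs: after wrap-around the newest entries occupy slots [0, index_start) and the older ones [index_start, num). A standalone sketch of the same scan order, where ExampleRecord is a local stand-in for TXB_RD_RECORD:

#include <stdint.h>

#define EX_BUF_LEN 64 /* stand-in for TX_SIZE_RD_RECORD_BUFFER_LEN */

typedef struct {
  uint32_t hash_vals[EX_BUF_LEN];
  int index_start; /* physical slot of the oldest live entry */
  int num;         /* number of live entries */
} ExampleRecord;

static int lookup_hash(const ExampleRecord *rec, uint32_t hash) {
  /* Newest entries first: after wrap-around they sit just below
   * index_start. This loop is empty until the buffer has wrapped. */
  for (int i = rec->index_start - 1; i >= 0; i--)
    if (rec->hash_vals[i] == hash) return i;
  /* Then the older entries, scanned from the back of the buffer down. */
  for (int i = rec->num - 1; i >= rec->index_start; i--)
    if (rec->hash_vals[i] == hash) return i;
  return -1; /* caller appends a fresh record, as in the code above */
}
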
+typedef struct {
+ int leaf;
+ int8_t children[4];
+} RD_RECORD_IDX_NODE;
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0, 0, 0, 0 } },
+ { 1, { 0, 0, 0, 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0 } },
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+ { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 5, 6 } },
+ { 0, { 7, 8, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 7, 8 } },
+ { 0, { 5, 6, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+ { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } },
+ { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+ { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } },
+ { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+ { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+ { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+ { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+ { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } },
+ { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+ { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+ { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+ { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+ { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } },
+ { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+ { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+ { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+ { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+ { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+ { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+ { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+ { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+ { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+ { 0, { 1, -1, 2, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+ NULL, // BLOCK_4X4
+ NULL, // BLOCK_4X8
+ NULL, // BLOCK_8X4
+ rd_record_tree_8x8, // BLOCK_8X8
+ rd_record_tree_8x16, // BLOCK_8X16
+ rd_record_tree_16x8, // BLOCK_16X8
+ rd_record_tree_16x16, // BLOCK_16X16
+ rd_record_tree_1_2, // BLOCK_16X32
+ rd_record_tree_2_1, // BLOCK_32X16
+ rd_record_tree_sqr, // BLOCK_32X32
+ rd_record_tree_1_2, // BLOCK_32X64
+ rd_record_tree_2_1, // BLOCK_64X32
+ rd_record_tree_sqr, // BLOCK_64X64
+ rd_record_tree_64x128, // BLOCK_64X128
+ rd_record_tree_128x64, // BLOCK_128X64
+ rd_record_tree_128x128, // BLOCK_128X128
+ NULL, // BLOCK_4X16
+ NULL, // BLOCK_16X4
+ rd_record_tree_1_4, // BLOCK_8X32
+ rd_record_tree_4_1, // BLOCK_32X8
+ rd_record_tree_1_4, // BLOCK_16X64
+ rd_record_tree_4_1, // BLOCK_64X16
+};
+
+static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 0, // BLOCK_4X8
+ 0, // BLOCK_8X4
+ sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8
+ sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16
+ sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8
+ sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64
+ sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128
+ sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64
+ sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128
+ 0, // BLOCK_4X16
+ 0, // BLOCK_16X4
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16
+};
+
+static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+ BLOCK_SIZE bsize) {
+ const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
+ const int size = rd_record_tree_size[bsize];
+ for (int i = 0; i < size; ++i) {
+ if (rd_record[i].leaf) {
+ av1_zero(tree[i].children);
+ } else {
+ for (int j = 0; j < 4; ++j) {
+ const int8_t idx = rd_record[i].children[j];
+ tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
+ }
+ }
+ }
+}
+
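
These tables replace the parent_idx_buf/child_idx_buf bookkeeping that used to be rebuilt inside find_tx_size_rd_records() (removed further below): the quadtree shape for each block size is fixed, so the child indices can be precomputed once. Note that init_rd_record_tree() treats an index of zero or below as "no child" (node 0 is always the root), which is why the tables use -1 for absent quadrants. A small hypothetical traversal over the resulting pointer tree:

#include <stddef.h>

/* Shape-compatible stand-in for TXB_RD_INFO_NODE: children[] entries are
 * either NULL or pointers into the same flat node array. */
typedef struct ExNode {
  struct ExNode *children[4];
} ExNode;

static int count_nodes(const ExNode *node) {
  if (node == NULL) return 0;
  int n = 1;
  for (int j = 0; j < 4; ++j) n += count_nodes(node->children[j]);
  return n;
}
/* For rd_record_tree_16x16 this reports 5 nodes: one root whose four
 * children are leaves. */
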
// Go through all TX blocks that could be used in TX size search, compute
// residual hash values for them and find matching RD info that stores previous
// RD search results for these TX blocks. The idea is to prevent repeated
@@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
// Hashing is performed only for square TX sizes larger than TX_4X4
if (max_square_tx_size < TX_8X8) return 0;
-
- const int bw_mi = mi_size_wide[bsize];
const int diff_stride = bw;
const struct macroblock_plane *const p = &x->plane[0];
const int16_t *diff = &p->src_diff[0];
-
+ init_rd_record_tree(dst_rd_info, bsize);
// Coordinates of the top-left corner of current block within the superblock
// measured in pixels:
const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
int cur_rd_info_idx = 0;
int cur_tx_depth = 0;
- uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
- uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
while (cur_tx_depth <= MAX_VARTX_DEPTH) {
const int cur_tx_bw = tx_size_wide[cur_tx_size];
const int cur_tx_bh = tx_size_high[cur_tx_size];
if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+ const int tx_size_idx = cur_tx_size - TX_8X8;
for (int row = 0; row < bh; row += cur_tx_bh) {
for (int col = 0; col < bw; col += cur_tx_bw) {
if (cur_tx_bw != cur_tx_bh) {
@@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
(uint8_t *)hash_data,
2 * cur_tx_bw * cur_tx_bh);
-
// Find corresponding RD info based on the hash value.
- const int rd_record_idx =
- row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
- col_in_sb;
-
- int idx = find_tx_size_rd_info(
- &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
+ const int record_idx =
+ row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+ TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+ int idx = find_tx_size_rd_info(records, hash);
dst_rd_info[cur_rd_info_idx].rd_info_array =
- &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
- .tx_rd_info[idx];
- }
-
- // Update the output quadtree RD info structure.
- av1_zero(dst_rd_info[cur_rd_info_idx].children);
- const int this_mi_row = row / MI_SIZE;
- const int this_mi_col = col / MI_SIZE;
- if (cur_tx_depth > 0) { // Set up child pointers.
- const int mi_index = this_mi_row * bw_mi + this_mi_col;
- const int child_idx = child_idx_buf[mi_index];
- assert(child_idx < 4);
- dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] =
- &dst_rd_info[cur_rd_info_idx];
- }
- if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx.
- const int tx_bh_mi = cur_tx_bh / MI_SIZE;
- const int tx_bw_mi = cur_tx_bw / MI_SIZE;
- for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) {
- memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx,
- tx_bw_mi);
- }
- int child_idx = 0;
- const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size];
- const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size];
- for (int i = this_mi_row; i < this_mi_row + tx_bh_mi;
- i += next_tx_bh_mi) {
- for (int j = this_mi_col; j < this_mi_col + tx_bw_mi;
- j += next_tx_bw_mi) {
- assert(child_idx < 4);
- child_idx_buf[i * bw_mi + j] = child_idx++;
- }
- }
+ &records->tx_rd_info[idx];
}
++cur_rd_info_idx;
}
@@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
const MACROBLOCKD *xd = &x->e_mbd;
const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
- *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
const int64_t mse = *dist / bw / bh;
// Normalized quantizer takes the transform upscaling factor (8 for tx size
// smaller than 32) into account.
@@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
mbmi->tx_size = tx_size;
- memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4);
+ for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
rd_stats->skip = 1;
rd_stats->rate = 0;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
@@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
int model_rate;
int64_t model_dist;
int model_skip;
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist,
- &model_skip, NULL, NULL, NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
+ &model_skip, NULL, NULL, NULL, NULL);
const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
// If the modeled rd is a lot worse than the best so far, break out.
// TODO(debargha, urvang): Improve the model and make the check below
// tighter.
assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
cpi->sf.model_based_prune_tx_search_level <= 2);
+ static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
+ 4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
if (!model_skip &&
- model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) >
- ref_best_rd)
+ ((model_rd *
+ prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
+ 3) > ref_best_rd)
return;
}
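
The rewritten threshold trades the old integer division model_rd / (5 - level) for a multiply-and-shift: with prune_factor_by8[] = { 2 + t, 4 + t } and t = MODELRD_TYPE_TX_SEARCH_PRUNE, the search is skipped when model_rd * factor / 8 exceeds ref_best_rd. A sketch with t assumed to be 1 (the enum's actual value is defined elsewhere):

#include <stdint.h>

/* level is cpi->sf.model_based_prune_tx_search_level, assumed 1 or 2 here. */
static int prune_tx_search(int64_t model_rd, int64_t ref_best_rd, int level) {
  static const int prune_factor_by8[] = { 3, 5 }; /* assumes t == 1 */
  return ((model_rd * prune_factor_by8[level - 1]) >> 3) > ref_best_rd;
}
/* e.g. level 2: model_rd = 1600 prunes whenever ref_best_rd < 1000. */
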
@@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
// Precompute residual hashes and find existing or add new RD records to
// store and reuse rate and distortion values to speed up TX size search.
- TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256];
+ TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
int found_rd_info = 0;
if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
found_rd_info =
@@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
assert(plane > 0);
assert(tx_size < TX_SIZES_ALL);
MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
ENTROPY_CONTEXT *ta = above_ctx + blk_col;
ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
- ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL);
+ &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
+
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int blk_idx = blk_row * mi_width + blk_col;
+
av1_set_txb_context(x, plane, block, tx_size, ta, tl);
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ }
+
+ // Set chroma blk_skip to 0
+ set_blk_skip(x, plane, blk_idx, 0);
}
// Return value 0: early termination triggered, no valid rd cost available;
// 1: rd cost values are valid.
static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
- int64_t ref_best_rd,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
FAST_TX_SEARCH_MODE ftxs_mode) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
int plane;
int is_cost_valid = 1;
int64_t this_rd = 0;
+ int64_t skip_rd = 0;
- if (ref_best_rd < 0) is_cost_valid = 0;
+ if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
av1_init_rd_stats(rd_stats);
- if (x->skip_chroma_rd) return is_cost_valid;
+ if (x->skip_chroma_rd) {
+ if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+ return is_cost_valid;
+ }
+
const BLOCK_SIZE bsizec = scale_chroma_bsize(
bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
@@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
const int step = bh * bw;
ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
- RD_STATS pn_rd_stats;
- av1_init_rd_stats(&pn_rd_stats);
av1_get_entropy_contexts(bsizec, pd, ta, tl);
for (idy = 0; idy < mi_height; idy += bh) {
for (idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if ((this_rd > non_skip_ref_best_rd) &&
+ (skip_rd > skip_ref_best_rd)) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
block += step;
}
}
-
- if (pn_rd_stats.rate == INT_MAX) {
- is_cost_valid = 0;
- break;
- }
-
- av1_merge_rd_stats(rd_stats, &pn_rd_stats);
-
- this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
- RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse));
-
- if (this_rd > ref_best_rd) {
- is_cost_valid = 0;
- break;
- }
}
- }
-
- if (!is_cost_valid) {
+ } else {
// reset cost value
av1_invalid_rd_stats(rd_stats);
}
@@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
(mv->col >> 3) > mv_limits->col_max;
}
-static INLINE int get_single_mode(int this_mode, int ref_idx,
- int is_comp_pred) {
- int single_mode;
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+ int ref_idx, int is_comp_pred) {
+ PREDICTION_MODE single_mode;
if (is_comp_pred) {
single_mode =
ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
@@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx,
return single_mode;
}
-/* If the current mode shares the same mv with other modes with higher priority,
- * skip this mode. This priority order is nearest > global > near. */
-static int skip_repeated_mv(const AV1_COMMON *const cm,
- const MACROBLOCK *const x, int this_mode,
- const MV_REFERENCE_FRAME ref_frames[2]) {
- const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
- const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- if (!is_comp_pred) {
- if (this_mode == NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
- // NEARMV has the same motion vector as NEARESTMV
- return 1;
- }
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
- // NEARMV has the same motion vector as GLOBALMV
- return 1;
- }
- }
- if (this_mode == GLOBALMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
- // GLOBALMV has the same motion vector as NEARESTMV
- return 1;
- }
- }
- } else {
- for (int i = 0; i < 2; ++i) {
- const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
- if (single_mode == NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) {
- // NEARMV has the same motion vector as NEARESTMV in compound mode
- return 1;
- }
- }
- }
- if (this_mode == NEAR_NEARMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
- cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
- // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV
- return 1;
- }
- }
- if (this_mode == GLOBAL_GLOBALMV) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 &&
- cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION &&
- cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) {
- // GLOBAL_GLOBALMV has the same motion vector as NEAREST_NEARESTMV
- return 1;
- }
- }
- }
- return 0;
-}
-
static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
int mi_col, int_mv *ref_mv_sub8x8[2],
@@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
const int num_planes = av1_num_planes(cm);
const int pw = block_size_wide[bsize];
const int ph = block_size_high[bsize];
+ const int plane = 0;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
// This function should only ever be called for compound modes
assert(has_second_ref(mbmi));
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
int_mv ref_mv[2];
int ite, ref;
@@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
struct macroblockd_plane *const pd = &xd->plane[0];
const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
- int is_global[2];
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ WarpTypesAllowed warp_types[2];
for (ref = 0; ref < 2; ++ref) {
const WarpedMotionParams *const wm =
&xd->global_motion[xd->mi[0]->ref_frame[ref]];
- is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype);
+ const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+ warp_types[ref].global_warp_allowed = is_global;
+ warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
}
// Do joint motion search in compound mode to get more accurate mv.
@@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
};
// Prediction buffer from second frame.
- DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
- uint8_t *second_pred;
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
(void)ref_mv_sub8x8;
+ const int have_nearmv = have_nearmv_in_inter_mode(mbmi->mode);
+ const int ref_mv_idx = mbmi->ref_mv_idx + (have_nearmv ? 1 : 0);
+ MV *const best_mv = &x->best_mv.as_mv;
+ const int search_range = SEARCH_RANGE_8P;
+ const int sadpb = x->sadperbit16;
// Run the joint search iteratively, alternating between the two reference
// frames, and break out of the search loop if it fails to find a better mv.
for (ite = 0; ite < 4; ite++) {
struct buf_2d ref_yv12[2];
int bestsme = INT_MAX;
- int sadpb = x->sadperbit16;
- MV *const best_mv = &x->best_mv.as_mv;
- int search_range = 3;
-
MvLimits tmp_mv_limits = x->mv_limits;
int id = ite % 2; // Even iterations search in the first reference frame,
// odd iterations search in the second. The predictor
// found for the 'other' reference frame is factored in.
- const int plane = 0;
- ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global[!id];
- warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
-
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
for (ref = 0; ref < 2; ++ref) {
ref_mv[ref] = av1_get_ref_mv(x, ref);
// Swap out the reference frame for a version that's been scaled to
@@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
ref_yv12[1] = xd->plane[plane].pre[1];
// Get the prediction block from the 'other' reference frame.
- InterpFilters interp_filters = EIGHTTAP_REGULAR;
+ const InterpFilters interp_filters = EIGHTTAP_REGULAR;
// Since we have scaled the reference frames to match the size of the
// current frame we must use a unit scaling factor during mode selection.
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
- av1_highbd_build_inter_predictor(
- ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
- &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters,
- &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE,
- mi_row * MI_SIZE, xd, cm->allow_warped_motion);
- } else {
- second_pred = (uint8_t *)second_pred_alloc_16;
- av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
- second_pred, pw, &cur_mv[!id].as_mv,
- &cm->sf_identity, pw, ph, &conv_params,
- interp_filters, &warp_types, p_col, p_row,
- plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
- mi_row * MI_SIZE, xd, cm->allow_warped_motion);
- }
+ av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &cur_mv[!id].as_mv,
+ &cm->sf_identity, pw, ph, &conv_params,
+ interp_filters, &warp_types[!id], p_col, p_row,
+ plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
const int order_idx = id != 0;
av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
@@ -6325,15 +6541,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
// Use the mv result from the single mode as mv predictor.
- // Use the mv result from the single mode as mv predictor.
*best_mv = cur_mv[id].as_mv;
best_mv->col >>= 3;
best_mv->row >>= 3;
- av1_set_mvcost(
- x, id,
- mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+ av1_set_mvcost(x, id, ref_mv_idx);
// Small-range full-pixel motion search.
bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
@@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
// Restore the pointer to the first prediction buffer.
if (id) xd->plane[plane].pre[0] = ref_yv12[0];
-
if (bestsme < last_besterr[id]) {
cur_mv[id].as_mv = *best_mv;
last_besterr[id] = bestsme;
@@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
*rate_mv = 0;
for (ref = 0; ref < 2; ++ref) {
- av1_set_mvcost(
- x, ref,
- mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
-
+ av1_set_mvcost(x, ref, ref_mv_idx);
const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
*rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
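
The early-out inserted at the top of the iteration loop above stops the alternating search once the motion vectors settle: from the third iteration on, if the mv of the 'other' reference did not move, the loop breaks when the active mv is also unchanged, or changed only below full-pel precision. A standalone rendering of that convergence test; ExMv is a local stand-in for MV, and mvs are stored in 1/8-pel units, so >> 3 yields full-pel.

#include <stdint.h>

typedef struct { int16_t row, col; } ExMv; /* stand-in for MV */

static int converged(const ExMv cur[2], const ExMv init[2], int ite, int id) {
  if (ite < 2) return 0; /* always run the first two iterations */
  if (cur[!id].row != init[!id].row || cur[!id].col != init[!id].col)
    return 0; /* the other reference's mv is still moving */
  if (cur[id].row == init[id].row && cur[id].col == init[id].col)
    return 1; /* nothing moved at all */
  /* Treat sub-pel-only movement as converged. */
  return (cur[id].row >> 3) == (init[id].row >> 3) &&
         (cur[id].col >> 3) == (init[id].col >> 3);
}
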
@@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
switch (mbmi->motion_mode) {
case SIMPLE_TRANSLATION:
- bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
- sadpb, cond_cost_list(cpi, cost_list),
- &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
- (MI_SIZE * mi_row), 0);
+ bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
break;
case OBMC_CAUSAL:
- bestsme = av1_obmc_full_pixel_diamond(
- cpi, x, &mvp_full, step_param, sadpb,
- MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
- &(x->best_mv.as_mv), 0);
+ bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param,
+ 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &(x->best_mv.as_mv), 0);
break;
default: assert(0 && "Invalid motion mode!\n");
}
@@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
cm->width, cm->height);
- ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global;
warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
// Get the prediction block from the 'other' reference frame.
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- av1_highbd_build_inter_predictor(
- ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
- 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
- MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
- cm->allow_warped_motion);
- } else {
- av1_build_inter_predictor(
- ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
- &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane,
- !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
- cm->allow_warped_motion);
- }
+ av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw,
+ other_mv, &sf, pw, ph, &conv_params,
+ mbmi->interp_filters, &warp_types, p_col, p_row,
+ plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
&xd->jcp_param.bck_offset,
@@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
int bestsme = INT_MAX;
int sadpb = x->sadperbit16;
MV *const best_mv = &x->best_mv.as_mv;
- int search_range = 3;
+ int search_range = SEARCH_RANGE_8P;
MvLimits tmp_mv_limits = x->mv_limits;
@@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed(
// near mv modes to reduce distortion in subsequent blocks and also improve
// visual quality.
#define NEW_MV_DISCOUNT_FACTOR 8
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
- int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
const MV_REFERENCE_FRAME *ref_frame,
const MB_MODE_INFO_EXT *mbmi_ext);
static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
- int this_mode, int_mv this_mv) {
+ PREDICTION_MODE this_mode, int_mv this_mv) {
if (this_mode == NEWMV && this_mv.as_int != 0 &&
!cpi->rc.is_src_frame_alt_ref) {
// Only discount new_mv when nearst_mv and all near_mv are zero, and the
@@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
+ assert(N >= 64);
int rate;
int64_t dist;
int64_t rd, best_rd = INT64_MAX;
@@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
(int64_t)aom_sum_squares_i16(residual1, N)) *
(1 << WEDGE_WEIGHT_BITS) / 2;
int16_t *ds = residual0;
- if (N < 64)
- av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N);
- else
- av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
- // TODO(jingning): Make sse2 functions support N = 16 case
- if (N < 64)
- wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
- else
- wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
- if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
- else
- sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ // int rate2;
+ // int64_t dist2;
+ // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+ // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+ // sse, rate, dist, rate2, dist2); dist = dist2;
+ // rate = rate2;
+
rate += x->wedge_idx_cost[bsize][wedge_index];
rd = RDCOST(x->rdmult, rate, dist);
@@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
const int N = bw * bh;
+ assert(N >= 64);
int rate;
int64_t dist;
int64_t rd, best_rd = INT64_MAX;
@@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
- if (N < 64)
- sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N);
- else
- sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
rate += x->wedge_idx_cost[bsize][wedge_index];
rd = RDCOST(x->rdmult, rate, dist);
@@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
MB_MODE_INFO *const mbmi = xd->mi[0];
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
- const int N = bw * bh;
+ const int N = 1 << num_pels_log2_lookup[bsize];
int rate;
- uint64_t sse;
int64_t dist;
- int64_t rd0;
DIFFWTD_MASK_TYPE cur_mask_type;
int64_t best_rd = INT64_MAX;
DIFFWTD_MASK_TYPE best_mask_type = 0;
const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
// try each mask type and its inverse
for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
// build mask and inverse
if (hbd)
av1_build_compound_diffwtd_mask_highbd(
- xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
else
- av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1,
- bw, bh, bw);
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+ p0, bw, p1, bw, bh, bw);
// compute rd for mask
- sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N);
+ uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+ tmp_mask[cur_mask_type], N);
sse = ROUND_POWER_OF_TWO(sse, bd_round);
- model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
- rd0 = RDCOST(x->rdmult, rate, dist);
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
if (rd0 < best_rd) {
best_mask_type = cur_mask_type;
best_rd = rd0;
}
}
-
- // make final mask
mbmi->interinter_comp.mask_type = best_mask_type;
- if (hbd)
- av1_build_compound_diffwtd_mask_highbd(
- xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0),
- bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
- else
- av1_build_compound_diffwtd_mask(
- xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw);
-
+ if (best_mask_type == DIFFWTD_38_INV) {
+ memcpy(xd->seg_mask, seg_mask, N * 2);
+ }
return best_rd;
}
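
The reworked loop builds each candidate mask into its own buffer, the first straight into xd->seg_mask and the second into local scratch, so the winning mask no longer has to be rebuilt after the loop; at most one memcpy remains, when the inverse mask wins. A self-contained sketch of that two-buffer pattern, where build_mask() and mask_cost() are hypothetical stand-ins for the mask builder and the RD evaluation:

#include <limits.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for av1_build_compound_diffwtd_mask() and the
 * model-RD evaluation of a mask. */
static void build_mask(uint8_t *m, int type, int n) {
  for (int i = 0; i < n; ++i)
    m[i] = (uint8_t)(type ? 64 - (i & 63) : (i & 63));
}
static int64_t mask_cost(const uint8_t *m, int n) {
  int64_t s = 0;
  for (int i = 0; i < n; ++i) s += m[i];
  return s;
}

static int pick_mask(uint8_t *dst, uint8_t *scratch, int n) {
  uint8_t *cand[2] = { dst, scratch };
  int best_type = 0;
  int64_t best = INT64_MAX;
  for (int type = 0; type < 2; ++type) {
    build_mask(cand[type], type, n);
    const int64_t cost = mask_cost(cand[type], n);
    if (cost < best) {
      best = cost;
      best_type = type;
    }
  }
  /* Copy back only when the scratch candidate won, as with DIFFWTD_38_INV
   * in the code above. */
  if (best_type == 1) memcpy(dst, scratch, n);
  return best_type;
}
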
@@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
}
}
-static int interinter_compound_motion_search(
- const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
- const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode,
+ int mi_row, int mi_col) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
int_mv tmp_mv[2];
@@ -7440,11 +7638,40 @@ static int interinter_compound_motion_search(
return tmp_rate_mv;
}
+static void get_inter_predictors_masked_compound(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
+ int16_t *residual1, int16_t *diff10, int *strides) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int can_use_previous = cm->allow_warped_motion;
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+ const struct buf_2d *const src = &x->plane[0].src;
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+ bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+ }
+}
+
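
build_and_cost_compound_type() (below) calls this helper lazily: the predictors and residuals are computed once, the first time a masked compound type is actually evaluated, and the *calc_pred_masked_compound flag is cleared so later compound types reuse them. The pattern in isolation, with hypothetical local names:

/* Minimal sketch of the lazy-init contract: the expensive setup runs at
 * most once across the compound-type loop. */
static void ensure_preds(int *calc_pred_masked_compound) {
  if (*calc_pred_masked_compound) {
    /* ... build predictors and residuals here (done once) ... */
    *calc_pred_masked_compound = 0;
  }
}

static void evaluate_compound_types(int num_types) {
  int calc_pred = 1;
  for (int type = 0; type < num_types; ++type) {
    ensure_preds(&calc_pred); /* heavy work on the first iteration only */
    /* ... rate/distortion evaluation of this compound type ... */
  }
}
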
static int64_t build_and_cost_compound_type(
const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
- const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv,
- BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
- int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) {
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
+ int *calc_pred_masked_compound) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type(
int64_t tmp_skip_sse_sb;
const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
+ preds1, residual1, diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+
best_rd_cur =
pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
*rs2 += get_interinter_compound_mask_rate(x, mbmi);
best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
- if (have_newmv_in_inter_mode(this_mode) &&
- use_masked_motion_search(compound_type)) {
+ // Although the true rate_mv might be different after motion search, it is
+ // unlikely to be the best mode considering the transform rd cost and other
+ // mode overhead costs.
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > ref_best_rd) return INT64_MAX;
+
+ if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
*out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
this_mode, mi_row, mi_col);
av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
if (rd >= best_rd_cur) {
mbmi->mv[0].as_int = cur_mv[0].as_int;
@@ -7508,12 +7746,72 @@ typedef struct {
int (*single_newmv_valid)[REF_FRAMES];
// Pointer to array of predicted rate-distortion
// Should point to first of 2 arrays in 2D array
- int64_t (*modelled_rd)[REF_FRAMES];
+ int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
int ref_frame_cost;
int single_comp_cost;
+ int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ int skip_motion_mode;
+ INTERINTRA_MODE *inter_intra_mode;
} HandleInterModeArgs;
+/* Skip the current mode if it has the same motion vector as an already
+ * searched mode whose cost is lower. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+ const MACROBLOCK *const x,
+ PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2],
+ InterModeSearchState *search_state) {
+ const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+ if (!is_comp_pred) {
+ if (this_mode == NEARMV) {
+ if (ref_mv_count == 0) {
+ // NEARMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // NEARMV has the same motion vector as GLOBALMV
+ compare_mode = GLOBALMV;
+ }
+ }
+ if (this_mode == GLOBALMV) {
+ if (ref_mv_count == 0 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // GLOBALMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1) {
+ // GLOBALMV has the same motion vector as NEARMV
+ compare_mode = NEARMV;
+ }
+ }
+
+ if (compare_mode != MB_MODE_COUNT) {
+ // Use modelled_rd to check whether compare mode was searched
+ if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] !=
+ INT64_MAX) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+ const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
+ const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+
+ // Only skip if the mode cost is larger than compare mode cost
+ if (this_cost > compare_cost) {
+ search_state->modelled_rd[this_mode][0][ref_frames[0]] =
+ search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
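The duplicate-MV cases handled above reduce to a small table (read directly off the branches in skip_repeated_mv()):

  this_mode   ref_mv_count   global motion (ref 0)   same MV as
  NEARMV      0              any                     NEARESTMV
  NEARMV      1              <= TRANSLATION          GLOBALMV
  GLOBALMV    0              <= TRANSLATION          NEARESTMV
  GLOBALMV    1              any                     NEARMV

A mode is skipped only when its signaling cost (cost_mv_ref) exceeds that of the already-searched duplicate, and the duplicate's modelled_rd entry is inherited so that later pruning still sees a valid estimate.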
static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
const AV1_COMMON *cm,
const MACROBLOCK *x) {
@@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd(
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- int tmp_rate, tmp_skip_sb = 0;
- int64_t tmp_dist, tmp_skip_sse = INT64_MAX;
+ int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
+ int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
const InterpFilters last_best = mbmi->interp_filters;
mbmi->interp_filters = filter_sets[filter_idx];
const int tmp_rs =
get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
- if (!skip_pred) {
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
- av1_subtract_plane(x, bsize, 0);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist,
- &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb,
- &tmp_skip_sse, NULL, NULL, NULL);
-#endif
+ assert(skip_pred != 2);
+ assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
+ assert(rate[0] >= 0);
+ assert(dist[0] >= 0);
+ assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
+ assert(skip_sse_sb[0] >= 0);
+ assert(rate[1] >= 0);
+ assert(dist[1] >= 0);
+ assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
+ assert(skip_sse_sb[1] >= 0);
+
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
+ tmp_rate[1] = tmp_rate[0];
+ tmp_dist[1] = tmp_dist[0];
+ } else {
+ // only luma MC is skipped
+ tmp_rate[1] = rate[0];
+ tmp_dist[1] = dist[0];
+ }
if (num_planes > 1) {
- int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
- if (tmp_y_rd > *rd) {
- mbmi->interp_filters = last_best;
- return 0;
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int tmp_rate_uv, tmp_skip_sb_uv;
+ int64_t tmp_dist_uv, tmp_skip_sse_uv;
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane);
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
+ &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
+ tmp_rate[1] =
+ (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
+ tmp_dist[1] += tmp_dist_uv;
+ tmp_skip_sb[1] &= tmp_skip_sb_uv;
+ tmp_skip_sse[1] += tmp_skip_sse_uv;
}
- int tmp_rate_uv, tmp_skip_sb_uv;
- int64_t tmp_dist_uv, tmp_skip_sse_uv;
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
- for (int plane = 1; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1,
- &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv,
- &tmp_skip_sse_uv, NULL, NULL, NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv,
- &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL,
- NULL, NULL);
-#endif
- tmp_rate += tmp_rate_uv;
- tmp_skip_sb &= tmp_skip_sb_uv;
- tmp_dist += tmp_dist_uv;
- tmp_skip_sse += tmp_skip_sse_uv;
}
} else {
- tmp_rate = *rate;
- tmp_dist = *dist;
+    // both luma and chroma MC are skipped
+ tmp_rate[1] = rate[1];
+ tmp_dist[1] = dist[1];
}
- int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist);
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+
if (tmp_rd < *rd) {
*rd = tmp_rd;
*switchable_rate = tmp_rs;
- *skip_txfm_sb = tmp_skip_sb;
- *skip_sse_sb = tmp_skip_sse;
- *rate = tmp_rate;
- *dist = tmp_dist;
- if (!skip_pred) {
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred == 0) {
+        // Overwrite the data as the current filter is the best one
+ tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
+ tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
+ memcpy(rate, tmp_rate, sizeof(*rate) * 2);
+ memcpy(dist, tmp_dist, sizeof(*dist) * 2);
+ memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
+ memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ rate[1] = tmp_rate[1];
+ dist[1] = tmp_dist[1];
+ skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
+ skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
+        // As luma MC data is not recomputed here and the current filter is
+        // the best so far, flag that luma MC data may need to be recomputed:
+        // if the current buffer holds valid MC data, toggle the flag so that
+        // luma MC data is regenerated after the search.
+ x->recalc_luma_mc_data ^= 1;
+ }
swap_dst_buf(xd, dst_bufs, num_planes);
}
return 1;
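All of the threshold checks above fold rate and distortion into one scalar via RDCOST. For context, the macro is defined in av1/encoder/rd.h along these lines in libaom of this vintage (shown as an aid; AV1_PROB_COST_SHIFT scales the bit cost, RDDIV_BITS the distortion):

  #define RDCOST(RM, R, D)                                            \
    (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
     ((D) << RDDIV_BITS))

i.e. a Lagrangian rate * rdmult + distortion sum, so a candidate filter wins only when its rate increase is outweighed by its distortion reduction at the current rdmult.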
@@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd(
int i;
const int bw = block_size_wide[bsize];
assert(best_dual_mode == 0);
- if ((bw <= 4) && (!skip_hor)) {
- int skip_pred = 1;
+ if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
// Process the filters in reverse order to enable reusing rate and
     // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
@@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd(
dist)) {
best_dual_mode = i;
}
- skip_pred = 0;
+ skip_pred = skip_hor;
}
} else {
for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
@@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd(
int best_dual_mode, int filter_set_size) {
int i;
const int bh = block_size_high[bsize];
- if ((bh <= 4) && (!skip_ver)) {
- int skip_pred = 1;
+ if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
// Process the filters in reverse order to enable reusing rate and
     // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
assert(filter_set_size == DUAL_FILTER_SET_SIZE);
@@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd(
switchable_rate, skip_txfm_sb, skip_sse_sb,
dst_bufs, i, switchable_ctx, skip_pred, rate,
dist);
- skip_pred = 0;
+ skip_pred = skip_ver;
}
} else {
for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
@@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
return 0;
}
}
+ if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0;
return 1;
}
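With interinter_comp.type now part of the match, a cache probe consistent with is_interp_filter_match() would look like the sketch below (hypothetical helper name; the tree's actual lookup is the counterpart of save_interp_filter_search_stat() further down, which fills the same per-compound-idx arrays):

  // Hypothetical sketch: return the index of a matching saved entry, or -1.
  static INLINE int find_interp_filter_match_sketch(MACROBLOCK *x,
                                                    MB_MODE_INFO *mi) {
    const int comp_idx = mi->compound_idx;
    for (int j = 0; j < x->interp_filter_stats_idx[comp_idx]; ++j) {
      if (is_interp_filter_match(&x->interp_filter_stats[comp_idx][j], mi))
        return j;
    }
    return -1;
  }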
@@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
const int comp_idx = mbmi->compound_idx;
const int offset = x->interp_filter_stats_idx[comp_idx];
if (offset < MAX_INTERP_FILTER_STATS) {
- INTERPOLATION_FILTER_STATS stat = {
- mbmi->interp_filters,
- { mbmi->mv[0], mbmi->mv[1] },
- { mbmi->ref_frame[0], mbmi->ref_frame[1] },
- };
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type };
x->interp_filter_stats[comp_idx][offset] = stat;
x->interp_filter_stats_idx[comp_idx]++;
}
@@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search(
int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
- int64_t *const skip_sse_sb) {
+ int64_t *const skip_sse_sb, const int skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int need_search =
av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
- int i, tmp_rate;
- int64_t tmp_dist;
+ int i;
+  // Index 0 corresponds to luma rd data and index 1 corresponds to the
+  // cumulative rd data of all planes.
+ int tmp_rate[2] = { 0, 0 };
+ int64_t tmp_dist[2] = { 0, 0 };
+ int best_skip_txfm_sb[2] = { 1, 1 };
+ int64_t best_skip_sse_sb[2] = { 0, 0 };
+ const int ref_frame = xd->mi[0]->ref_frame[0];
(void)single_filter;
int match_found = -1;
@@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search(
switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
*switchable_rate =
get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
- for (int plane = 0; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
-#if DNN_BASED_RD_INTERP_FILTER
- model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
- &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL,
- NULL);
-#else
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
- skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
-#endif // DNN_BASED_RD_INTERP_FILTER
- *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
+ if (!skip_build_pred)
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
+ if (num_planes > 1)
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
+ &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
+ NULL);
+ tmp_rate[1] =
+ (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
+ assert(tmp_rate[1] >= 0);
+ tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
+ best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
+ best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
+ *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
if (assign_filter != SWITCHABLE || match_found != -1) {
return 0;
@@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search(
av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
return 0;
}
- int skip_hor = 1;
- int skip_ver = 1;
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+  // skip_flag is a two-bit mask (in binary form): setting bit 0 corresponds
+  // to skipping luma MC and setting bit 1 corresponds to skipping chroma MC.
+  //   skip_flag=0: do not skip luma or chroma MC
+  //   skip_flag=1: skip luma MC only
+  //   skip_flag=2: not a valid case
+  //   skip_flag=3: skip both luma and chroma MC
+ int skip_hor = cpi->default_interp_skip_flags;
+ int skip_ver = cpi->default_interp_skip_flags;
const int is_compound = has_second_ref(mbmi);
- for (int k = 0; k < num_planes - 1; ++k) {
- struct macroblockd_plane *const pd = &xd->plane[k];
- const int bw = pd->width;
- const int bh = pd->height;
- for (int j = 0; j < 1 + is_compound; ++j) {
- const MV mv = mbmi->mv[j].as_mv;
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int j = 0; j < 1 + is_compound; ++j) {
+ const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
+ const struct scale_factors *const sf = &ref_buf->sf;
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ skip_hor = 0;
+ skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[j].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
+ struct macroblockd_plane *const pd = &xd->plane[k];
+ const int bw = pd->width;
+ const int bh = pd->height;
const MV mv_q4 = clamp_mv_to_umv_border_sb(
xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
- skip_hor &= (sub_x == 0);
- skip_ver &= (sub_y == 0);
- }
+ skip_hor_plane |= ((sub_x == 0) << k);
+ skip_ver_plane |= ((sub_y == 0) << k);
+ }
+ skip_hor = skip_hor & skip_hor_plane;
+ skip_ver = skip_ver & skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(skip_hor != 2);
+ assert(skip_ver != 2);
+ }
+  // When the compound prediction type is COMPOUND_DIFFWTD, luma MC and chroma
+  // MC need to go hand in hand as the mask generated during luma MC is
+  // required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask used
+  // for chroma MC during the vertical filter decision may be incorrect as the
+  // temporary MC evaluation overwrites the mask. Set skip_ver to 0 in this
+  // case so that the mask is populated during luma MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
}
// do interp_filter search
const int filter_set_size = DUAL_FILTER_SET_SIZE;
@@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search(
// EIGHTTAP_REGULAR mode is calculated beforehand
best_dual_mode = find_best_horiz_interp_filter_rd(
x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
- skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
- &tmp_rate, &tmp_dist, best_dual_mode);
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+ tmp_rate, tmp_dist, best_dual_mode);
// From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
find_best_vert_interp_filter_rd(
x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
- skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
- &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size);
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+ tmp_rate, tmp_dist, best_dual_mode, filter_set_size);
} else {
// EIGHTTAP_REGULAR mode is calculated beforehand
for (i = 1; i < filter_set_size; ++i) {
@@ -7912,12 +8316,25 @@ static int64_t interpolation_filter_search(
if (filter_x != filter_y) continue;
}
interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i, switchable_ctx, 0, &tmp_rate,
- &tmp_dist);
+ switchable_rate, best_skip_txfm_sb,
+ best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0,
+ tmp_rate, tmp_dist);
+ assert(x->recalc_luma_mc_data == 0);
}
}
swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+    // Recomputing the final luma MC data is required only if it was skipped
+    // in either of the two directions. The condition below is necessary,
+    // but not sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
// save search results
if (cpi->sf.skip_repeat_interpolation_filter_search) {
assert(match_found == -1);
@@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search(
return 0;
}
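A worked example of the modelled_rd pruning inside interpolation_filter_search() above: if the two single-reference modes were modelled at 800 and 1000, then mrd = AOMMIN(800, 1000) = 800; a compound estimate of *rd = 2000 trips the test, since 2000 >> 1 = 1000 > 800, and the mode is abandoned with INT64_MAX before any per-filter refinement. The right shift keeps a 2x margin: a compound mode survives unless its estimate is more than twice the better single-reference estimate, so modelling error alone rarely discards a genuinely better mode.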
+static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int mode_rate, int64_t ref_best_rd) {
+  /*
+   * This function combines the y and uv planes' transform search processes
+   * once the prediction has been generated. It first performs a subtraction
+   * to obtain the prediction error. Then it calls
+   * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
+   * handles the early terminations that happen in those functions. At the
+   * end, it computes rd_stats/_y/_uv accordingly.
+   */
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int skip_txfm_sb = 0;
+ const int num_planes = av1_num_planes(cm);
+ const int ref_frame_1 = mbmi->ref_frame[1];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+  // Account for the minimum of the skip and non-skip rd:
+  // eventually one of the two will be added to mode_rate.
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ av1_init_rd_stats(rd_stats_uv);
+ rd_stats->rate = mode_rate;
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+ if (!skip_txfm_sb) {
+ int64_t non_skip_rdcosty = INT64_MAX;
+ int64_t skip_rdcosty = INT64_MAX;
+ int64_t min_rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Motion mode
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y->skip);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ // TODO(angiebird): check if we need this
+ // restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ non_skip_rdcosty = RDCOST(
+ x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
+ skip_rdcosty =
+ RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
+ min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+
+ if (min_rdcosty > ref_best_rd) {
+ int64_t tokenonly_rdy =
+ AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+ RDCOST(x->rdmult, 0, rd_stats_y->sse));
+      // Invalidate rd_stats_y to skip the rest of the motion mode search.
+ if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
+ rd_thresh)
+ av1_invalid_rd_stats(rd_stats_y);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ if (num_planes > 1) {
+ /* clang-format off */
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
+ ref_best_rd - non_skip_rdcosty,
+ ref_best_rd - skip_rdcosty, FTXS_NONE);
+ if (!is_cost_valid_uv) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ } else {
+ av1_init_rd_stats(rd_stats_uv);
+ }
+ if (rd_stats->skip) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ mbmi->skip = 0;
+      // Here mbmi->skip temporarily plays the role of this_skip2.
+
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][0],
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ mbmi->skip = 0;
+ }
+ } else {
+ x->skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+    // The cost of the skip bit needs to be added.
+ mbmi->skip = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ }
+ return 1;
+}
+
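To make the skip/non-skip decision in txfm_search() concrete with example numbers: suppose x->skip_cost[skip_ctx] = {30, 300}, rd_stats_y->rate + rd_stats_uv->rate = 1200, dist = 5000 and sse = 9000. The coded path then costs RDCOST(x->rdmult, 1200 + 30, 5000) while the skip path costs RDCOST(x->rdmult, 300, 9000); if the skip path is cheaper, the token rates are zeroed, dist is replaced by sse, and mbmi->skip is set to 1, exactly as in the branches above.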
+static int handle_inter_intra_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args,
+ int64_t ref_best_rd, int *rate_mv,
+ int *tmp_rate2, BUFFER_SET *orig_dst) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t rd, best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int bw = block_size_wide[bsize];
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int *const interintra_mode_cost =
+ x->interintra_mode_cost[size_group_lookup[bsize]];
+ const int_mv mv0 = mbmi->mv[0];
+ const int is_wedge_used = is_interintra_wedge_used(bsize);
+ int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
+ int j = 0;
+ if (cpi->sf.reuse_inter_intra_mode == 0 ||
+ best_interintra_mode == INTERINTRA_MODES) {
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ }
+ if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
+ best_interintra_rd = rd;
+ if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
+ return -1;
+ }
+ if (is_wedge_used) {
+ int64_t best_interintra_rd_nowedge = rd;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ x->wedge_interintra_cost[bsize][1];
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
+ rd = INT64_MAX;
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ // get negative of mask
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ tmp_mv = mbmi->mv[0];
+ compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+ mi_col, intrapred, mask, bw, &tmp_rate_mv,
+ 0);
+ if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+ bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+ dist_sum);
+ }
+ }
+ if (rd >= best_interintra_rd_wedge) {
+ tmp_mv.as_int = mv0.as_int;
+ tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ }
+ } // if (is_interintra_wedge_used(bsize))
+ if (num_planes > 1) {
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ return 0;
+}
+
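The av1_combine_interintra() calls above blend the intra predictor into the inter predictor under a 6-bit mask (AOM_BLEND_A64_MAX_ALPHA == 64 in aom_dsp). A minimal 8-bit sketch of that blend, assuming a contiguous per-pixel mask in [0, 64] that weights the intra samples:

  // Illustrative only: comb = (m * intra + (64 - m) * inter + 32) >> 6.
  static void blend_a64_sketch(uint8_t *comb, const uint8_t *inter,
                               const uint8_t *intra, const uint8_t *mask,
                               int w, int h) {
    for (int i = 0; i < w * h; ++i)
      comb[i] = (uint8_t)((mask[i] * intra[i] +
                           (64 - mask[i]) * inter[i] + 32) >> 6);
  }

Because tmp_buf_ keeps the plain inter prediction, the wedge search only has to re-blend rather than re-run motion compensation when it changes the mask.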
// TODO(afergs): Refactor the MBMI references in here - there are four
// TODO(afergs): Refactor optional args - add them to a struct or remove
static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
int *disable_skip, int mi_row, int mi_col,
HandleInterModeArgs *const args,
- int64_t ref_best_rd, const int *refs, int rate_mv,
- BUFFER_SET *orig_dst
+ int64_t ref_best_rd, const int *refs,
+ int *rate_mv, BUFFER_SET *orig_dst
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
,
- int64_t *best_est_rd
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ int do_tx_search, InterModesInfo *inter_modes_info
#endif
) {
const AV1_COMMON *const cm = &cpi->common;
@@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
MB_MODE_INFO *mbmi = xd->mi[0];
const int is_comp_pred = has_second_ref(mbmi);
const PREDICTION_MODE this_mode = mbmi->mode;
- int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip, best_disable_skip = 0;
RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
MB_MODE_INFO base_mbmi, best_mbmi;
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+
int interintra_allowed = cm->seq_params.enable_interintra_compound &&
is_interintra_allowed(mbmi) && mbmi->compound_idx;
int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
- int total_samples;
-
- (void)rate_mv;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
av1_invalid_rd_stats(&best_rd_stats);
-
aom_clear_system_state();
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
- total_samples = mbmi->num_proj_ref[0];
- rate2_nocoeff = rd_stats->rate;
+  mbmi->num_proj_ref = 1;  // assume num_proj_ref >= 1
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ if (cm->switchable_motion_mode) {
+ last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion);
+ }
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+ }
+ int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
base_mbmi = *mbmi;
- MOTION_MODE last_motion_mode_allowed =
- cm->switchable_motion_mode
- ? motion_mode_allowed(xd->global_motion, xd, mbmi,
- cm->allow_warped_motion)
- : SIMPLE_TRANSLATION;
- assert(mbmi->ref_frame[1] != INTRA_FRAME);
- const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ const int switchable_rate =
+ av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
int64_t best_rd = INT64_MAX;
-
+ int best_rate_mv = rate_mv0;
for (int mode_index = (int)SIMPLE_TRANSLATION;
mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;
int64_t tmp_rd = INT64_MAX;
int tmp_rate2 = rate2_nocoeff;
int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
int skip_txfm_sb = 0;
+ int tmp_rate_mv = rate_mv0;
*mbmi = base_mbmi;
if (is_interintra_mode) {
@@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
// The prediction is calculated before motion_mode_rd() is called in
// handle_inter_mode()
} else if (mbmi->motion_mode == OBMC_CAUSAL) {
- mbmi->motion_mode = OBMC_CAUSAL;
- if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
- int tmp_rate_mv = 0;
-
+ uint32_t cur_mv = mbmi->mv[0].as_int;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
mbmi->mv[0].as_int = x->best_mv.as_int;
#if USE_DISCOUNT_NEWMV_TEST
@@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
#endif
- tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
+ if (mbmi->mv[0].as_int != cur_mv) {
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
}
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
av1_build_obmc_inter_prediction(
cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
args->left_pred_buf, args->left_pred_stride);
} else if (mbmi->motion_mode == WARPED_CAUSAL) {
int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
mbmi->motion_mode = WARPED_CAUSAL;
- mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
mbmi->interp_filters = av1_broadcast_interp_filter(
av1_unswitchable_filter(cm->interp_filter));
memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
// Select the samples according to motion vector difference
- if (mbmi->num_proj_ref[0] > 1) {
- mbmi->num_proj_ref[0] = selectSamples(
- &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
}
- if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
- &mbmi->wm_params[0], mi_row, mi_col)) {
+ &mbmi->wm_params, mi_row, mi_col)) {
// Refine MV for NEWMV mode
- if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
- int tmp_rate_mv = 0;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
const int_mv mv0 = mbmi->mv[0];
- const WarpedMotionParams wm_params0 = mbmi->wm_params[0];
- int num_proj_ref0 = mbmi->num_proj_ref[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ int num_proj_ref0 = mbmi->num_proj_ref;
// Refine MV in a small range.
av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
@@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
#endif
- tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
} else {
// Restore the old MV and WM parameters.
mbmi->mv[0] = mv0;
- mbmi->wm_params[0] = wm_params0;
- mbmi->num_proj_ref[0] = num_proj_ref0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
}
}
@@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
continue;
}
} else if (is_interintra_mode) {
- INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
- int64_t rd, best_interintra_rd = INT64_MAX;
- int rmode, rate_sum;
- int64_t dist_sum;
- int j;
- int tmp_rate_mv = 0;
- int tmp_skip_txfm_sb;
- int bw = block_size_wide[bsize];
- int64_t tmp_skip_sse_sb;
- DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
- uint8_t *tmp_buf, *intrapred;
- const int *const interintra_mode_cost =
- x->interintra_mode_cost[size_group_lookup[bsize]];
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
- intrapred = CONVERT_TO_BYTEPTR(intrapred_);
- } else {
- tmp_buf = tmp_buf_;
- intrapred = intrapred_;
- }
- const int_mv mv0 = mbmi->mv[0];
-
- mbmi->ref_frame[1] = NONE_FRAME;
- xd->plane[0].dst.buf = tmp_buf;
- xd->plane[0].dst.stride = bw;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
-
- restore_dst_buf(xd, *orig_dst, num_planes);
- mbmi->ref_frame[1] = INTRA_FRAME;
- mbmi->use_wedge_interintra = 0;
- for (j = 0; j < INTERINTRA_MODES; ++j) {
- mbmi->interintra_mode = (INTERINTRA_MODE)j;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
- intrapred, bw);
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
- rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
- if (rd < best_interintra_rd) {
- best_interintra_rd = rd;
- best_interintra_mode = mbmi->interintra_mode;
- }
- }
- mbmi->interintra_mode = best_interintra_mode;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
- intrapred, bw);
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum);
- best_interintra_rd = rd;
-
- if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
- // restore ref_frame[1]
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
- }
-
- if (is_interintra_wedge_used(bsize)) {
- int64_t best_interintra_rd_nowedge = INT64_MAX;
- int64_t best_interintra_rd_wedge = INT64_MAX;
- int_mv tmp_mv;
- InterpFilters backup_interp_filters = mbmi->interp_filters;
- int rwedge = x->wedge_interintra_cost[bsize][0];
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum);
- best_interintra_rd_nowedge = rd;
-
- // Disable wedge search if source variance is small
- if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
- mbmi->use_wedge_interintra = 1;
-
- rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
- x->wedge_interintra_cost[bsize][1];
-
- best_interintra_rd_wedge =
- pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
-
- best_interintra_rd_wedge +=
- RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0);
- // Refine motion vector.
- if (have_newmv_in_inter_mode(mbmi->mode)) {
- // get negative of mask
- const uint8_t *mask = av1_get_contiguous_soft_mask(
- mbmi->interintra_wedge_index, 1, bsize);
- tmp_mv = av1_get_ref_mv(x, 0);
- compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
- mi_col, intrapred, mask, bw,
- &tmp_rate_mv, 0);
- mbmi->mv[0].as_int = tmp_mv.as_int;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
- bsize);
- av1_subtract_plane(x, bsize, 0);
- model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL,
- NULL);
- rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
- dist_sum);
- if (rd >= best_interintra_rd_wedge) {
- tmp_mv.as_int = mv0.as_int;
- tmp_rate_mv = rate_mv;
- mbmi->interp_filters = backup_interp_filters;
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- }
- } else {
- tmp_mv.as_int = mv0.as_int;
- tmp_rate_mv = rate_mv;
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- }
- // Evaluate closer to true rd
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
- INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
- dist_sum);
- best_interintra_rd_wedge = rd;
- if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
- mbmi->use_wedge_interintra = 1;
- mbmi->mv[0].as_int = tmp_mv.as_int;
- tmp_rate2 += tmp_rate_mv - rate_mv;
- } else {
- mbmi->use_wedge_interintra = 0;
- mbmi->mv[0].as_int = mv0.as_int;
- mbmi->interp_filters = backup_interp_filters;
- }
- } else {
- mbmi->use_wedge_interintra = 0;
- }
- } // if (is_interintra_wedge_used(bsize))
- restore_dst_buf(xd, *orig_dst, num_planes);
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ const int ret = handle_inter_intra_mode(
+ cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
+ &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
}
if (!cpi->common.all_lossless)
@@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
rd_stats->sse = 0;
rd_stats->skip = 1;
rd_stats->rate = tmp_rate2;
- if (av1_is_interp_needed(xd))
- rd_stats->rate += av1_get_switchable_rate(cm, x, xd);
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
if (interintra_allowed) {
rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
[mbmi->ref_frame[1] == INTRA_FRAME];
@@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
}
}
+
if (!skip_txfm_sb) {
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
int64_t est_rd = 0;
int est_skip = 0;
- if (cpi->sf.inter_mode_rd_model_estimation) {
- InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type];
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
if (md->ready) {
const int64_t curr_sse = get_sse(cpi, x);
- est_rd =
- get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate);
+ est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
+ rd_stats->rate);
est_skip = est_rd * 0.8 > *best_est_rd;
-#if INTER_MODE_RD_TEST
- if (est_rd < *best_est_rd) {
- *best_est_rd = est_rd;
- }
-#else // INTER_MODE_RD_TEST
if (est_skip) {
- ++md->skip_count;
mbmi->ref_frame[1] = ref_frame_1;
continue;
} else {
if (est_rd < *best_est_rd) {
*best_est_rd = est_rd;
}
- ++md->non_skip_count;
}
-#endif // INTER_MODE_RD_TEST
}
}
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
- int64_t rdcosty = INT64_MAX;
- int is_cost_valid_uv = 0;
-
- // cost and distortion
- av1_subtract_plane(x, bsize, 0);
- if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
- // Motion mode
- select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col,
- ref_best_rd);
-#if CONFIG_COLLECT_RD_STATS == 2
- PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
-#endif // CONFIG_COLLECT_RD_STATS == 2
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (cm->reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
} else {
- super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
- memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats_y->skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
}
-
- if (rd_stats_y->rate == INT_MAX) {
- av1_invalid_rd_stats(rd_stats);
- if (mbmi->motion_mode != SIMPLE_TRANSLATION ||
- mbmi->ref_frame[1] == INTRA_FRAME) {
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
- } else {
- restore_dst_buf(xd, *orig_dst, num_planes);
- mbmi->ref_frame[1] = ref_frame_1;
+ } else {
+#endif
+ int mode_rate = rd_stats->rate;
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
return INT64_MAX;
}
+ continue;
}
-
- av1_merge_rd_stats(rd_stats, rd_stats_y);
-
- rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse));
- if (num_planes > 1) {
- /* clang-format off */
- is_cost_valid_uv =
- inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty,
- FTXS_NONE);
- if (!is_cost_valid_uv) {
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
+ if (!skip_txfm_sb) {
+ const int64_t curr_rd =
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
}
- /* clang-format on */
- av1_merge_rd_stats(rd_stats, rd_stats_uv);
- } else {
- av1_init_rd_stats(rd_stats_uv);
- }
-#if CONFIG_RD_DEBUG
- // record transform block coefficient cost
- // TODO(angiebird): So far rd_debug tool only detects discrepancy of
- // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi
- // here because we already collect the coefficient cost. Move this part to
- // other place when we need to compare non-coefficient cost.
- mbmi->rd_stats = *rd_stats;
-#endif // CONFIG_RD_DEBUG
- const int skip_ctx = av1_get_skip_context(xd);
- if (rd_stats->skip) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- mbmi->skip = 0;
- // here mbmi->skip temporarily plays a role as what this_skip2 does
- } else if (!xd->lossless[mbmi->segment_id] &&
- (RDCOST(x->rdmult,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][0],
- rd_stats->dist) >= RDCOST(x->rdmult,
- x->skip_cost[skip_ctx][1],
- rd_stats->sse))) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- rd_stats->dist = rd_stats->sse;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- mbmi->skip = 1;
- } else {
- rd_stats->rate += x->skip_cost[skip_ctx][0];
- mbmi->skip = 0;
- }
- *disable_skip = 0;
+ *disable_skip = 0;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
- cm->tile_rows == 1) {
-#if INTER_MODE_RD_TEST
- if (md->ready) {
- int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (est_skip) {
- ++md->skip_count;
- if (real_rd < ref_best_rd) {
- ++md->fp_skip_count;
- }
- // int fp_skip = real_rd < ref_best_rd;
- // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd
- // %ld ref_best_rd %ld\n",
- // est_skip, fp_skip, est_rd, *best_est_rd, real_rd,
- // ref_best_rd);
- } else {
- ++md->non_skip_count;
- }
+ if (cpi->sf.inter_mode_rd_model_estimation) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+ rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
}
-#endif // INTER_MODE_RD_TEST
- inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][mbmi->skip],
- rd_stats->rate, ref_best_rd);
- }
#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
- int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (curr_rd < ref_best_rd) {
- ref_best_rd = curr_rd;
+ } else {
+ *disable_skip = 1;
}
- } else {
- x->skip = 1;
- *disable_skip = 1;
- mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
-
- // The cost of skip bit needs to be added.
- mbmi->skip = 0;
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
-
- rd_stats->dist = 0;
- rd_stats->sse = 0;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- rd_stats->skip = 1;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
}
+#endif
if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
if (is_nontrans_global_motion(xd, xd->mi[0])) {
@@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if ((mbmi->motion_mode == SIMPLE_TRANSLATION &&
- mbmi->ref_frame[1] != INTRA_FRAME) ||
- (tmp_rd < best_rd)) {
+ if (mode_index == 0)
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ if ((mode_index == 0) || (tmp_rd < best_rd)) {
best_mbmi = *mbmi;
best_rd = tmp_rd;
best_rd_stats = *rd_stats;
best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
best_xskip = x->skip;
best_disable_skip = *disable_skip;
if (best_xskip) break;
}
}
mbmi->ref_frame[1] = ref_frame_1;
-
+ *rate_mv = best_rate_mv;
if (best_rd == INT64_MAX) {
av1_invalid_rd_stats(rd_stats);
restore_dst_buf(xd, *orig_dst, num_planes);
@@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
*rd_stats_y = best_rd_stats_y;
if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
memcpy(x->blk_skip, best_blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
x->skip = best_xskip;
*disable_skip = best_disable_skip;
@@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
return 0;
}
-#ifndef NDEBUG
-static INLINE int is_single_inter_mode(int this_mode) {
- return this_mode >= SINGLE_INTER_MODE_START &&
- this_mode < SINGLE_INTER_MODE_END;
-}
-#endif
-
-static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
- assert(is_single_inter_mode(single_mode));
+static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
+ uint8_t ref_mv_idx) {
+ assert(is_inter_singleref_mode(single_mode));
int ref_mv_offset;
if (single_mode == NEARESTMV) {
ref_mv_offset = 0;
@@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) {
return ref_mv_offset;
}
-static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
- int ref_mv_idx,
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
const MV_REFERENCE_FRAME *ref_frame,
const MB_MODE_INFO_EXT *mbmi_ext) {
const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
- const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred);
- assert(is_single_inter_mode(single_mode));
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, ref_idx, is_comp_pred);
+ assert(is_inter_singleref_mode(single_mode));
if (single_mode == NEWMV) {
this_mv->as_int = INVALID_MV;
} else if (single_mode == GLOBALMV) {
@@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx,
}
// This function updates the non-new mv for the current prediction mode
-static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
const AV1_COMMON *cm, const MACROBLOCK *x) {
const MACROBLOCKD *xd = &x->e_mbd;
const MB_MODE_INFO *mbmi = xd->mi[0];
@@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode,
int_mv this_mv;
get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
x->mbmi_ext);
- const int single_mode = get_single_mode(this_mode, i, is_comp_pred);
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, i, is_comp_pred);
if (single_mode == NEWMV) {
cur_mv[i] = this_mv;
} else {
@@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
return cost;
}
-static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int mi_col, int mi_row,
- int_mv *cur_mv, int masked_compound_used,
- BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst,
- int *rate_mv, int64_t *rd,
- RD_STATS *rd_stats, int64_t ref_best_rd) {
+// Struct for buffers used by compound_type_rd() function.
+// For sizes and alignment of these arrays, refer to
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+ uint8_t *pred0;
+ uint8_t *pred1;
+ int16_t *residual1; // src - pred1
+ int16_t *diff10; // pred1 - pred0
+ uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
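CompoundTypeRdBuffers lifts what used to be roughly 160 KB of per-call stack arrays (for 128x128 superblocks) into caller-owned storage; the deleted DECLARE_ALIGNED locals are visible further down in this hunk. A hypothetical allocator mirroring those sizes and alignments with the aom_mem.h helpers:

  // Hypothetical sketch; the real alloc_compound_type_rd_buffers() is
  // referenced in the struct comment but not shown in this diff.
  static int alloc_compound_type_rd_buffers_sketch(CompoundTypeRdBuffers *b) {
    b->pred0 = (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE);
    b->pred1 = (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE);
    b->residual1 =
        (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*b->residual1));
    b->diff10 =
        (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*b->diff10));
    b->tmp_best_mask_buf = (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE);
    return b->pred0 && b->pred1 && b->residual1 && b->diff10 &&
           b->tmp_best_mask_buf;
  }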
+static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_col, int mi_row,
+ int_mv *cur_mv, int masked_compound_used,
+ BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+ CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats,
+ int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
- const int this_mode = mbmi->mode;
+ const PREDICTION_MODE this_mode = mbmi->mode;
const int bw = block_size_wide[bsize];
- const int bh = block_size_high[bsize];
int rate_sum, rs2;
int64_t dist_sum;
@@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
int64_t tmp_skip_sse_sb;
INTERINTER_COMPOUND_DATA best_compound_data;
best_compound_data.type = COMPOUND_AVERAGE;
- DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
- DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
- uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE];
- uint8_t *preds0[1] = { pred0 };
- uint8_t *preds1[1] = { pred1 };
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
int strides[1] = { bw };
int tmp_rate_mv;
const int num_pix = 1 << num_pels_log2_lookup[bsize];
const int mask_len = 2 * num_pix * sizeof(uint8_t);
COMPOUND_TYPE cur_type;
int best_compmode_interinter_cost = 0;
- int can_use_previous = cm->allow_warped_motion;
+ int calc_pred_masked_compound = 1;
best_mv[0].as_int = cur_mv[0].as_int;
best_mv[1].as_int = cur_mv[1].as_int;
*rd = INT64_MAX;
- if (masked_compound_used) {
- // get inter predictors to use for masked compound modes
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
- av1_build_inter_predictors_for_planes_single_buf(
- xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
- const struct buf_2d *const src = &x->plane[0].src;
- if (get_bitdepth_data_path_index(xd)) {
- aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
- CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
- aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1),
- bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
- } else {
- aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1,
- bw);
- aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
- }
- }
- const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0];
- const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst;
- const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst;
for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
if (!is_interinter_compound_used(cur_type, bsize)) continue;
@@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
}
masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
rs2 = masked_type_cost;
- // No need to call av1_build_inter_predictors_sby here
- // 1. COMPOUND_AVERAGE is always the first candidate
- // 2. av1_build_inter_predictors_sby has been called by
- // interpolation_filter_search
- int64_t est_rd =
- estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd < ref_best_rd) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (est_rd != INT64_MAX)
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ }
// Use the spare buffer for the remaining compound type trials
- restore_dst_buf(xd, *backup_buf, 1);
- if (est_rd != INT64_MAX)
- best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ restore_dst_buf(xd, *tmp_dst, 1);
} else {
mbmi->comp_group_idx = 1;
masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
@@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
*rd / 3 < ref_best_rd) {
best_rd_cur = build_and_cost_compound_type(
cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
- &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row,
- mi_col);
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
+ &calc_pred_masked_compound);
}
}
if (best_rd_cur < *rd) {
*rd = best_rd_cur;
best_compound_data = mbmi->interinter_comp;
if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
- memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len);
+ memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
}
best_compmode_interinter_cost = rs2;
if (have_newmv_in_inter_mode(this_mode)) {
- if (use_masked_motion_search(cur_type)) {
+ if (cur_type == COMPOUND_WEDGE) {
best_tmp_rate_mv = tmp_rate_mv;
best_mv[0].as_int = mbmi->mv[0].as_int;
best_mv[1].as_int = mbmi->mv[1].as_int;
@@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->comp_group_idx =
(best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
mbmi->interinter_comp = best_compound_data;
- memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len);
+ memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
}
if (have_newmv_in_inter_mode(this_mode)) {
mbmi->mv[0].as_int = best_mv[0].as_int;
mbmi->mv[1].as_int = best_mv[1].as_int;
- if (use_masked_motion_search(mbmi->interinter_comp.type)) {
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
rd_stats->rate += best_tmp_rate_mv - *rate_mv;
*rate_mv = best_tmp_rate_mv;
}
}
- restore_dst_buf(xd, *best_buf, 1);
+ restore_dst_buf(xd, *orig_dst, 1);
return best_compmode_interinter_cost;
}
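The mode_rd gate above exploits the fact that RDCOST() is monotone in both
arguments: with the distortion term set to 0 it yields a lower bound on the
mode's final cost, so any mode whose rate cost alone already exceeds
ref_best_rd can be rejected before the (expensive) prediction build. A
minimal sketch of the idea, illustrative only and not part of the patch:

static int mode_can_possibly_win(const MACROBLOCK *x, int rate,
                                 int64_t ref_best_rd) {
  // With dist == 0, RDCOST() lower-bounds the final rd of this mode.
  return RDCOST(x->rdmult, rate, 0) < ref_best_rd;
}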
+static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
+ MB_MODE_INFO *mbmi,
+ PREDICTION_MODE this_mode) {
+ for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1);
+ const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+ if (single_mode == NEWMV &&
+ args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE mode) {
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+ const int has_drl =
+ (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1;
+
+ return ref_set;
+}
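A pared-down restatement of the DRL count logic above, with the MACROBLOCK
plumbing removed (an illustrative sketch; it assumes MAX_REF_MV_SERCH == 3,
its value in the encoder headers):

static int drl_refmv_count_sketch(int ref_mv_count, int has_nearmv,
                                  int only_newmv) {
  // DRL indices are only coded when the mv stack has spare candidates.
  const int has_drl =
      (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
  const int remaining = ref_mv_count - has_nearmv;
  return has_drl ? (remaining < 3 ? remaining : 3) : 1;
}

For example, NEARMV with four stack entries searches min(3, 4 - 1) = 3
ref_mv_idx values, while NEWMV with a single candidate codes no DRL syntax
and searches exactly one.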
+
+typedef struct {
+ int64_t rd;
+ int drl_cost;
+ int rate_mv;
+ int_mv mv;
+} inter_mode_info;
+
static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, RD_STATS *rd_stats,
RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
int *disable_skip, int mi_row, int mi_col,
- HandleInterModeArgs *args, int64_t ref_best_rd
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf,
+ CompoundTypeRdBuffers *rd_buffers
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
,
- int64_t *best_est_rd
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ const int do_tx_search,
+ InterModesInfo *inter_modes_info
#endif
) {
const AV1_COMMON *cm = &cpi->common;
@@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO *mbmi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const int is_comp_pred = has_second_ref(mbmi);
- const int this_mode = mbmi->mode;
+ const PREDICTION_MODE this_mode = mbmi->mode;
int i;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int rate_mv = 0;
- DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
int64_t rd = INT64_MAX;
- BUFFER_SET orig_dst, tmp_dst;
+
+ // Do the first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ struct macroblockd_plane *p = xd->plane;
+ BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
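For reference, the three planes of tmp_dst above share one backing buffer,
laid out back to back MAX_SB_SQUARE bytes apart with a fixed MAX_SB_SIZE
stride (the allocation is doubled upstream for the high-bit-depth path). An
illustrative addressing helper for the 8-bit case, assuming
MAX_SB_SIZE == 128 and therefore MAX_SB_SQUARE == 16384; not part of the
patch:

static uint8_t *tmp_dst_pixel(uint8_t *tmp_buf, int plane, int row, int col) {
  return tmp_buf + plane * (128 * 128)  // plane bases MAX_SB_SQUARE apart
         + row * 128                    // fixed MAX_SB_SIZE stride
         + col;
}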
@@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
MB_MODE_INFO best_mbmi = *mbmi;
int best_disable_skip;
int best_xskip;
- int plane_rate[MAX_MB_PLANE] = { 0 };
- int64_t plane_sse[MAX_MB_PLANE] = { 0 };
- int64_t plane_dist[MAX_MB_PLANE] = { 0 };
int64_t newmv_ret_val = INT64_MAX;
int_mv backup_mv[2] = { { 0 } };
int backup_rate_mv = 0;
+ inter_mode_info mode_info[MAX_REF_MV_SERCH];
int comp_idx;
const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
(mbmi->mode != GLOBAL_GLOBALMV);
- const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
- ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
- mbmi_ext->ref_mv_count[ref_frame_type] > 1);
-
// TODO(jingning): This should be deprecated shortly.
- const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
- const int ref_set =
- has_drl ? AOMMIN(MAX_REF_MV_SERCH,
- mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset)
- : 1;
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].rd = INT64_MAX;
+
if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
if (mbmi->ref_frame[0] == LAST2_FRAME ||
mbmi->ref_frame[0] == LAST3_FRAME ||
mbmi->ref_frame[1] == LAST2_FRAME ||
mbmi->ref_frame[1] == LAST3_FRAME) {
- if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset]
+ if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
.weight < REF_CAT_LEVEL) {
continue;
}
@@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
mode_ctx =
av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
- mbmi->num_proj_ref[0] = 0;
- mbmi->num_proj_ref[1] = 0;
+ mbmi->num_proj_ref = 0;
mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->ref_mv_idx = ref_mv_idx;
- if (is_comp_pred) {
- for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) {
- const int single_mode =
- get_single_mode(this_mode, ref_idx, is_comp_pred);
- if (single_mode == NEWMV &&
- args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]]
- .as_int == INVALID_MV)
- continue;
- }
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
+ continue;
}
rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
- rd_stats->rate +=
+ const int drl_cost =
get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
- const RD_STATS backup_rd_stats = *rd_stats;
- const MB_MODE_INFO backup_mbmi = *mbmi;
int64_t best_rd2 = INT64_MAX;
+ const RD_STATS backup_rd_stats = *rd_stats;
// If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
int rs = 0;
int compmode_interinter_cost = 0;
- *rd_stats = backup_rd_stats;
- *mbmi = backup_mbmi;
mbmi->compound_idx = comp_idx;
-
if (is_comp_pred && comp_idx == 0) {
+ *rd_stats = backup_rd_stats;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->comp_group_idx = 0;
- mbmi->compound_idx = 0;
const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
const int comp_index_ctx = get_comp_index_context(cm, xd);
@@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
} else {
rd_stats->rate += rate_mv;
}
+
+ if (cpi->sf.skip_repeated_newmv) {
+ if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+ int skip = 0;
+ int this_rate_mv = 0;
+ for (i = 0; i < ref_mv_idx; ++i) {
+ // Check if the motion search result same as previous results
+ if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
+ // If the compared mode has no valid rd, it is unlikely this
+ // mode will be the best mode
+ if (mode_info[i].rd == INT64_MAX) {
+ skip = 1;
+ break;
+ }
+ // Compare the total costs, i.e. the DRL cost plus the MV rate
+ if (mode_info[i].mv.as_int != INVALID_MV) {
+ const int compare_cost =
+ mode_info[i].rate_mv + mode_info[i].drl_cost;
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv,
+ &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+ const int this_cost = this_rate_mv + drl_cost;
+
+ if (compare_cost < this_cost) {
+ skip = 1;
+ break;
+ } else {
+ // If this cost is less than the current best result, make this
+ // the best and update the corresponding variables
+ if (best_mbmi.ref_mv_idx == i) {
+ assert(best_rd != INT64_MAX);
+ best_mbmi.ref_mv_idx = ref_mv_idx;
+ best_rd_stats.rate += this_cost - compare_cost;
+ best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+ best_rd_stats.dist);
+ if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+
+ skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (skip) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->modelled_rd[this_mode][i][refs[0]];
+ args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->simple_rd[this_mode][i][refs[0]];
+ mode_info[ref_mv_idx].rd = mode_info[i].rd;
+ mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+ mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
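Restating the reuse test above (illustrative only): when a repeated NEWMV
search lands on the same MV as an earlier ref_mv_idx, the earlier result is
kept whenever its rate was already at least as cheap, since only the DRL
cost and the MV rate differ between the two candidates.

static int earlier_newmv_is_cheaper(int prev_rate_mv, int prev_drl_cost,
                                    int this_rate_mv, int this_drl_cost) {
  // Mirrors the compare_cost < this_cost test in the loop above.
  return prev_rate_mv + prev_drl_cost < this_rate_mv + this_drl_cost;
}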
}
for (i = 0; i < is_comp_pred + 1; ++i) {
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
-
- // Initialise tmp_dst and orig_dst buffers to prevent "may be used
- // uninitialized" warnings in GCC when the stream is monochrome.
- memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride));
- memset(orig_dst.plane, 0, sizeof(tmp_dst.plane));
- memset(orig_dst.stride, 0, sizeof(tmp_dst.stride));
-
- // do first prediction into the destination buffer. Do the next
- // prediction into a temporary buffer. Then keep track of which one
- // of these currently holds the best predictor, and use the other
- // one for future predictions. In the end, copy from tmp_buf to
- // dst if necessary.
- for (i = 0; i < num_planes; i++) {
- tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
- tmp_dst.stride[i] = MAX_SB_SIZE;
- }
- for (i = 0; i < num_planes; i++) {
- orig_dst.plane[i] = xd->plane[i].dst.buf;
- orig_dst.stride[i] = xd->plane[i].dst.stride;
- }
-
const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
#if USE_DISCOUNT_NEWMV_TEST
// We don't include the cost of the second reference here, because there
@@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
continue;
}
- ret_val = interpolation_filter_search(
- x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
- args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
- if (ret_val != 0) {
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
- } else if (cpi->sf.model_based_post_interp_filter_breakout &&
- ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) {
- restore_dst_buf(xd, orig_dst, num_planes);
- if ((rd >> 4) > ref_best_rd) break;
- continue;
- }
-
+ int skip_build_pred = 0;
if (is_comp_pred && comp_idx) {
+ // Find matching interp filter or set to default interp filter
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
+
int64_t best_rd_compound;
compmode_interinter_cost = compound_type_rd(
cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
- &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats,
- ref_best_rd);
+ &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+ rd_stats, ref_best_rd);
if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
- if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) {
- int tmp_rate;
- int64_t tmp_dist;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst,
- bsize);
- for (int plane = 0; plane < num_planes; ++plane)
- av1_subtract_plane(x, bsize, plane);
- model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate,
- &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate,
- plane_sse, plane_dist);
- rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
+ // No need to call av1_build_inter_predictors_sby if
+ // COMPOUND_AVERAGE is selected: it is the first candidate
+ // evaluated in compound_type_rd, and the subsequent compound
+ // type searches use the tmp_dst buffer
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize);
+ skip_build_pred = 1;
}
}
+ ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+ args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
+ skip_build_pred, args, ref_best_rd);
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ if ((rd >> 3) * 2 > ref_best_rd) break;
+ continue;
+ }
+
if (search_jnt_comp) {
// If half of the model rd is already larger than the best rd from
// the jnt_comp pass, keep the jnt_comp result and skip this search
- if ((rd >> 1) > best_rd) {
+ if ((rd >> 3) * 4 > best_rd) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
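The breakout thresholds above all use the same integer idiom:
(rd >> 3) * k > best_rd tests whether k/8 of rd exceeds best_rd using
integers only, and shifting before the multiply keeps the product
comfortably inside int64_t. So (rd >> 3) * 4 is roughly rd / 2,
(rd >> 3) * 3 roughly 3 * rd / 8, and (rd >> 3) * 6 roughly 3 * rd / 4,
each up to integer truncation. A hedged restatement, not part of the patch:

static int exceeds_eighths(int64_t rd, int num_eighths, int64_t best_rd) {
  // True when approximately (num_eighths / 8) * rd > best_rd.
  return (rd >> 3) * num_eighths > best_rd;
}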
@@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (is_comp_pred) {
const int mode0 = compound_ref0_mode(this_mode);
const int mode1 = compound_ref1_mode(this_mode);
- const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
- args->modelled_rd[mode1][refs[1]]);
- if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
restore_dst_buf(xd, orig_dst, num_planes);
continue;
}
- } else {
- args->modelled_rd[this_mode][refs[0]] = rd;
- }
- }
-
- if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
- // if current pred_error modeled rd is substantially more than the best
- // so far, do not bother doing full rd
- if (rd / 2 > ref_best_rd) {
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
}
}
-
rd_stats->rate += compmode_interinter_cost;
if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
// TODO(chengchen): this speed feature introduces a large loss.
// It needs a better rate-distortion estimate.
+ int dummy_rate;
+ int64_t dummy_dist;
+ int plane_rate[MAX_MB_PLANE] = { 0 };
+ int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+ int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+
+ model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+ cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
+ &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
+ plane_dist);
+
rd_stats->rate += rs;
rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
rd_stats_y->rate = plane_rate[0];
@@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
} else {
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ret_val =
- motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
- disable_skip, mi_row, mi_col, args, ref_best_rd,
- refs, rate_mv, &orig_dst, best_est_rd);
+ ret_val = motion_mode_rd(
+ cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
+ mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
+ tile_data, best_est_rd, do_tx_search, inter_modes_info);
#else
ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
rd_stats_uv, disable_skip, mi_row, mi_col,
- args, ref_best_rd, refs, rate_mv, &orig_dst);
+ args, ref_best_rd, refs, &rate_mv, &orig_dst);
#endif
}
+ mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+ mode_info[ref_mv_idx].rate_mv = rate_mv;
if (ret_val != INT64_MAX) {
int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ mode_info[ref_mv_idx].rd = tmp_rd;
if (tmp_rd < best_rd) {
best_rd_stats = *rd_stats;
best_rd_stats_y = *rd_stats_y;
@@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
best_disable_skip = *disable_skip;
best_xskip = x->skip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
}
if (tmp_rd < best_rd2) {
@@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
restore_dst_buf(xd, orig_dst, num_planes);
}
-
- args->modelled_rd = NULL;
}
if (best_rd == INT64_MAX) return INT64_MAX;
@@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
assert(IMPLIES(mbmi->comp_group_idx == 1,
mbmi->interinter_comp.type != COMPOUND_AVERAGE));
memcpy(x->blk_skip, best_blk_skip,
- sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
}
@@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
int sadpb = x->sadperbit16;
int cost_list[5];
int bestsme = av1_full_pixel_search(
- cpi, x, bsize, &mvp_full, step_param, sadpb,
- cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
(MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
x->mv_limits = tmp_mv_limits;
@@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
} else {
super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats.skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats.skip);
}
if (num_planes > 1) {
super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
@@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
best_skip = x->skip;
best_rdcost = rdc_noskip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
}
if (!xd->lossless[mbmi->segment_id]) {
@@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
best_skip = x->skip;
best_rdcost = rdc_skip;
memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
}
}
}
@@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
*rd_cost = best_rdcost;
x->skip = best_skip;
memcpy(x->blk_skip, best_blk_skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
return best_rd;
}
@@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
mbmi->mv[0].as_int = 0;
const int64_t intra_yrd =
- rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
- &y_skip, bsize, best_rd, ctx);
+ rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, bsize, best_rd, ctx);
if (intra_yrd < best_rd) {
// Only store reconstructed luma when there's chroma RDO. When there's no
@@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
mbmi->uv_mode = UV_DC_PRED;
mbmi->ref_frame[0] = ref_frame;
mbmi->ref_frame[1] = second_ref_frame;
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
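+ // A ref_mv_count of UINT8_MAX marks a reference type whose mv stack has
+ // not been populated by set_params_rd_pick_inter_mode yet; build it on
+ // demand below, unless either underlying single ref stack is missing too.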
+ if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
+ if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+ x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+ return;
+ }
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+ }
assert(this_mode == NEAREST_NEARESTMV);
if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
@@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
memset(search_state->best_mbmode.inter_tx_size,
search_state->best_mbmode.tx_size,
sizeof(search_state->best_mbmode.inter_tx_size));
- set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h,
+ set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
// Set up color-related variables for skip mode.
@@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search(
} else {
super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- memset(x->blk_skip, rd_stats_y.skip,
- sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w);
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y.skip);
}
if (num_planes > 1) {
- inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE);
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
+ FTXS_NONE);
} else {
av1_init_rd_stats(&rd_stats_uv);
}
@@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search(
static void set_params_rd_pick_inter_mode(
const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
- uint32_t mode_skip_mask[REF_FRAMES],
+ uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
unsigned int ref_costs_single[REF_FRAMES],
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
@@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode(
x->pred_mv_sad[ref_frame] = INT_MAX;
x->mbmi_ext->mode_context[ref_frame] = 0;
x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ int skip = 1;
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ skip = 0;
+ break;
+ }
+ }
+ }
+ if (skip) continue;
+ }
+ }
assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
yv12_mb);
}
}
-
- // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector
- // references for compound prediction, as not every pair of reference frames
- // woud be examined for the RD evaluation.
+ // Here ref_frame == ALTREF_FRAME + 1, i.e. the first compound ref type
for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
x->mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ continue;
+ }
+ }
av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
mi_col, mbmi_ext->mode_context);
@@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode(
}
}
-static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ MB_MODE_INFO *const mbmi,
PALETTE_MODE_INFO *const pmi,
unsigned int *ref_costs_single,
InterModeSearchState *search_state) {
@@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE_FRAME;
rate_overhead_palette = rd_pick_palette_intra_sby(
- cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
- best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL,
- NULL, NULL, NULL, ctx, best_blk_skip);
+ cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
if (pmi->palette_size[0] == 0) return;
memcpy(x->blk_skip, best_blk_skip,
@@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state,
av1_zero(search_state->single_newmv);
av1_zero(search_state->single_newmv_rate);
av1_zero(search_state->single_newmv_valid);
- for (int i = 0; i < MB_MODE_COUNT; ++i)
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
- search_state->modelled_rd[i][ref_frame] = INT64_MAX;
+ for (int i = 0; i < MB_MODE_COUNT; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SERCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ SingleInterModeState *state;
+
+ state = &search_state->single_state[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ state = &search_state->single_state_modelled[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+ }
+ }
+ }
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+ }
+ }
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
}
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip the mode's motion mode search, but still
+// try its simple translation mode
static int inter_mode_search_order_independent_skip(
- const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
- int mi_row, int mi_col, uint32_t *mode_skip_mask,
- uint16_t *ref_frame_skip_mask) {
+ const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
+ uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
+ InterModeSearchState *search_state) {
const SPEED_FEATURES *const sf = &cpi->sf;
const AV1_COMMON *const cm = &cpi->common;
const struct segmentation *const seg = &cm->seg;
@@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip(
const unsigned char segment_id = mbmi->segment_id;
const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+ // Compound ref modes depend on the motion estimation results of the
+ // two corresponding single ref modes (their best MVs are used as the
+ // starting points). So if the current single ref mode is marked to be
+ // skipped, check whether it is still needed by a compound ref mode.
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_type || rf[1] == ref_type) {
+ // Found a compound ref mode that is not skipped and uses the
+ // current single ref, so this single ref cannot be skipped
+ // completely. Skip only its motion mode search and still try
+ // its simple translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ break;
+ }
+ }
+ }
+ }
+ if (skip_ref) return 1;
+ }
if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
!x->cb_partition_scan) {
@@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip(
return 1;
}
- if (skip_repeated_mv(cm, x, this_mode, ref_frame)) {
+ if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
return 1;
}
+ if (skip_motion_mode) {
+ return 2;
+ }
return 0;
}
@@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
set_default_interp_filters(mbmi, cm->interp_filter);
}
-static int handle_intra_mode(InterModeSearchState *search_state,
- const AV1_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int ref_frame_cost,
- const PICK_MODE_CONTEXT *ctx, int disable_skip,
- RD_STATS *rd_stats, RD_STATS *rd_stats_y,
- RD_STATS *rd_stats_uv) {
+static int64_t handle_intra_mode(InterModeSearchState *search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, int disable_skip,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv) {
const AV1_COMMON *cm = &cpi->common;
const SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state,
const int rows = block_size_high[bsize];
const int cols = block_size_wide[bsize];
const int num_planes = av1_num_planes(cm);
- av1_init_rd_stats(rd_stats);
- av1_init_rd_stats(rd_stats_y);
- av1_init_rd_stats(rd_stats_uv);
+ const int skip_ctx = av1_get_skip_context(xd);
+
+ int known_rate = intra_mode_cost[mbmi->mode];
+ known_rate += ref_frame_cost;
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+
TX_SIZE uv_tx;
int is_directional_mode = av1_is_directional_mode(mbmi->mode);
if (is_directional_mode && av1_use_angle_delta(bsize)) {
@@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state,
search_state->directional_mode_skip_mask);
search_state->angle_stats_ready = 1;
}
- if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0;
+ if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
+ av1_init_rd_stats(rd_stats_y);
rd_stats_y->rate = INT_MAX;
- rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize,
- intra_mode_cost[mbmi->mode], search_state->best_rd,
- &model_rd);
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
+ bsize, intra_mode_cost[mbmi->mode],
+ search_state->best_rd, &model_rd);
} else {
+ av1_init_rd_stats(rd_stats_y);
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
}
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
memcpy(best_blk_skip, x->blk_skip,
sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
-
+ int try_filter_intra = 0;
+ int64_t best_rd_tmp = INT64_MAX;
if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ if (rd_stats_y->rate != INT_MAX) {
+ const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+ intra_mode_cost[mbmi->mode];
+ best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
+ } else {
+ try_filter_intra = !(search_state->best_mbmode.skip);
+ }
+ }
+ if (try_filter_intra) {
RD_STATS rd_stats_y_fi;
int filter_intra_selected_flag = 0;
TX_SIZE best_tx_size = mbmi->tx_size;
@@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state,
memcpy(best_txk_type, mbmi->txk_type,
sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
- int64_t best_rd_tmp = INT64_MAX;
- if (rd_stats_y->rate != INT_MAX) {
- best_rd_tmp = RDCOST(x->rdmult,
- rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
- intra_mode_cost[mbmi->mode],
- rd_stats_y->dist);
- }
mbmi->filter_intra_mode_info.use_filter_intra = 1;
for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
int64_t this_rd_tmp;
mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
-
super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
if (rd_stats_y_fi.rate == INT_MAX) {
continue;
@@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state,
intra_mode_cost[mbmi->mode]);
this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
+ break;
+ }
if (this_rd_tmp < best_rd_tmp) {
best_tx_size = mbmi->tx_size;
memcpy(best_txk_type, mbmi->txk_type,
@@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state,
mbmi->filter_intra_mode_info.use_filter_intra = 0;
}
}
-
- if (rd_stats_y->rate == INT_MAX) return 0;
-
+ if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+ const int mode_cost_y =
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_uv);
if (num_planes > 1) {
uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+ int rate_y =
+ rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+ const int64_t rdy =
+ RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
choose_intra_uv_mode(
cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
&search_state->rate_uv_tokenonly[uv_tx],
@@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state,
&search_state->mode_uv[uv_tx]);
if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
+ const int64_t uv_dist = search_state->dist_uvs[uv_tx];
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
}
rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
@@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state,
}
mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
}
-
- rd_stats->rate =
- rd_stats_y->rate +
- intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ rd_stats->rate = rd_stats_y->rate + mode_cost_y;
if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
// super_block_yrd above includes the cost of the tx_size in the
// tokenonly rate, but for intra blocks, tx_size is always coded
@@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state,
rd_stats_y->rate = 0;
rd_stats_uv->rate = 0;
// Cost the skip mb case
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1];
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
} else {
// Add in the cost of the no skip flag.
- rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0];
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
}
// Calculate the final RD estimate for this mode.
- int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
// Keep record of best intra rd
if (this_rd < search_state->best_intra_rd) {
search_state->best_intra_rd = this_rd;
@@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state,
search_state->best_pred_rd[i] =
AOMMIN(search_state->best_pred_rd[i], this_rd);
}
- return 1;
+ return this_rd;
}
-void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+static void collect_single_states(MACROBLOCK *x,
+ InterModeSearchState *search_state,
+ const MB_MODE_INFO *const mbmi) {
+ int i, j;
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+ const int mode_offset = INTER_OFFSET(this_mode);
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+ // Simple rd
+ int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < simple_rd) simple_rd = rd;
+ }
+
+ // Insertion sort of single_state
+ SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+ SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+ i = search_state->single_state_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+ state_s[j] = state_s[j - 1];
+ state_s[j] = this_state_s;
+ search_state->single_state_cnt[dir][mode_offset]++;
+
+ // Modelled rd
+ int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < modelled_rd) modelled_rd = rd;
+ }
+
+ // Insertion sort of single_state_modelled
+ SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode_offset];
+ i = search_state->single_state_modelled_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+ state_m[j] = state_m[j - 1];
+ state_m[j] = this_state_m;
+ search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+static void analyze_single_states(const AV1_COMP *cpi,
+ InterModeSearchState *search_state) {
+ int i, j, dir, mode;
+ if (cpi->sf.prune_comp_search_by_single_result >= 1) {
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+
+ // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+ // reference frames for all the modes (NEARESTMV and NEARMV may not
+ // have the same motion vectors). Always keep the best of each mode
+ // because it might form the best possible combination with another mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+ ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid)
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ if (count < max_candidates) {
+ for (i = 0; i < state_cnt_m; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (state_m[i].valid) {
+ int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+ // Check whether this ref_frame is already in the order list
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) {
+ // Check whether this ref_frame was pruned in the simple rd pass
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; j++) {
+ if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) {
+ valid = 0;
+ break;
+ }
+ }
+ if (valid)
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ if (count >= max_candidates) break;
+ }
+ }
+ }
+ }
+ }
+}
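The invalidation rule above can be read directly off the shift: a single-ref
candidate is dropped when its rd is more than twice the best NEWMV/GLOBALMV
rd in the same direction, since (rd >> 1) > best_rd is rd / 2 > best_rd up
to integer truncation. An illustrative restatement, not part of the patch:

static int prune_single_state(int64_t rd, int64_t best_rd) {
  // INT64_MAX entries are placeholders for unsearched states; keep them.
  return rd != INT64_MAX && (rd >> 1) > best_rd;
}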
+
+static int compound_skip_get_candidates(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const int dir, const PREDICTION_MODE mode) {
+ const int mode_offset = INTER_OFFSET(mode);
+ const SingleInterModeState *state =
+ search_state->single_state[dir][mode_offset];
+ const SingleInterModeState *state_modelled =
+ search_state->single_state_modelled[dir][mode_offset];
+ int max_candidates = 0;
+ int candidates;
+
+ for (int i = 0; i < FWD_REFS; ++i) {
+ if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+ max_candidates++;
+ }
+
+ candidates = max_candidates;
+ if (cpi->sf.prune_comp_search_by_single_result >= 2) {
+ candidates = AOMMIN(2, max_candidates);
+ }
+ if (cpi->sf.prune_comp_search_by_single_result >= 3) {
+ if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+ state[0].ref_frame == state_modelled[0].ref_frame)
+ candidates = 1;
+ if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+ }
+ return candidates;
+}
+
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (mode[i] == NEARESTMV || mode[i] == NEARMV) {
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+ int identical = 1;
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs,
+ x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext);
+
+ identical &= (single_mv.as_int == comp_mv.as_int);
+ if (!identical) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ if (ref_searched[i] && ref_mv_match[i]) {
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+ }
+
+ return 0;
+}
+
+static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
+ InterModeSearchState *search_state) {
+ const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1];
+ if (search_state->num_available_refs > 2) {
+ if ((ref_frame == search_state->dist_order_refs[0] &&
+ second_ref_frame == search_state->dist_order_refs[1]) ||
+ (ref_frame == search_state->dist_order_refs[1] &&
+ second_ref_frame == search_state->dist_order_refs[0]))
+ return 1; // drop this pair of refs
+ }
+ return 0;
+}
+
+static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
+ const MODE_DEFINITION *mode,
+ int64_t distortion2) {
+ const PREDICTION_MODE this_mode = mode->mode;
+ MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const int idx = ref_frame - LAST_FRAME;
+ if (idx && distortion2 > search_state->dist_refs[idx]) {
+ search_state->dist_refs[idx] = distortion2;
+ search_state->dist_order_refs[idx] = ref_frame;
+ }
+
+ // Reached the last single ref prediction mode
+ if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
+ // Bubble-sort dist_refs and the corresponding order indices
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ for (int k = i + 1; k < REF_FRAMES; ++k) {
+ if (search_state->dist_refs[i] < search_state->dist_refs[k]) {
+ int64_t tmp_dist = search_state->dist_refs[i];
+ search_state->dist_refs[i] = search_state->dist_refs[k];
+ search_state->dist_refs[k] = tmp_dist;
+
+ int tmp_idx = search_state->dist_order_refs[i];
+ search_state->dist_order_refs[i] = search_state->dist_order_refs[k];
+ search_state->dist_order_refs[k] = tmp_idx;
+ }
+ }
+ }
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (search_state->dist_refs[i] == -1) break;
+ search_state->num_available_refs = i;
+ }
+ search_state->num_available_refs++;
+ }
+}
+
+static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+ CompoundTypeRdBuffers *const bufs) {
+ CHECK_MEM_ERROR(
+ cm, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ CHECK_MEM_ERROR(
+ cm, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
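Typical lifecycle of these buffers (an illustrative sketch mirroring their
use in av1_rd_pick_inter_mode_sb() below): they are allocated once per
rd-pick call, shared by every handle_inter_mode() invocation, and freed by
the release helper that follows.

CompoundTypeRdBuffers rd_buffers;
alloc_compound_type_rd_buffers(cm, &rd_buffers);
// ... every handle_inter_mode() call in the mode loop shares rd_buffers ...
release_compound_type_rd_buffers(&rd_buffers);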
+
+static void release_compound_type_rd_buffers(
+ CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs); // Set all pointers to NULL for safety.
+}
+
-void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
+void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, int mi_row, int mi_col,
RD_STATS *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
- const AV1_COMMON *const cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
const SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const struct segmentation *const seg = &cm->seg;
PREDICTION_MODE this_mode;
- MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
- int i, k;
+ int i;
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
unsigned int ref_costs_single[REF_FRAMES];
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
@@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
InterModeSearchState search_state;
init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
best_rd_so_far);
-
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
HandleInterModeArgs args = {
{ NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
{ NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
NULL, NULL,
- NULL, NULL,
+ NULL, search_state.modelled_rd,
{ { 0 } }, INT_MAX,
- INT_MAX
+ INT_MAX, search_state.simple_rd,
+ 0, interintra_modes
};
for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
av1_invalid_rd_stats(rd_cost);
// init params, set frame modes, speed features
- set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
- ref_frame_skip_mask, mode_skip_mask,
- ref_costs_single, ref_costs_comp, yv12_mb);
+ set_params_rd_pick_inter_mode(
+ cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
+ ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
int64_t best_est_rd = INT64_MAX;
+ // TODO(angiebird): Turn this on when this speed feature is well tested
+#if 1
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ const int do_tx_search = !md->ready;
+#else
+ const int do_tx_search = 1;
+#endif
+ InterModesInfo *inter_modes_info = &tile_data->inter_modes_info;
+ inter_modes_info->num = 0;
#endif
+ int intra_mode_num = 0;
+ int intra_mode_idx_ls[MAX_MODES];
+ int reach_first_comp_mode = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ // We allocate them once and reuse them in every call to that function.
+ // Note: they must be allocated on the heap due to the large size of
+ // the arrays.
+ uint8_t *tmp_buf_orig;
+ CHECK_MEM_ERROR(
+ cm, tmp_buf_orig,
+ (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE));
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig);
+
+ CompoundTypeRdBuffers rd_buffers;
+ alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
for (int midx = 0; midx < MAX_MODES; ++midx) {
int mode_index = mode_map[midx];
int64_t this_rd = INT64_MAX;
@@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
int64_t distortion2 = 0;
int skippable = 0;
int this_skip2 = 0;
-
- this_mode = av1_mode_order[mode_index].mode;
- ref_frame = av1_mode_order[mode_index].ref_frame[0];
- second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
+ const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+ const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+ this_mode = mode_order->mode;
init_mbmi(mbmi, mode_index, cm);
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
- if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index,
- mi_row, mi_col, mode_skip_mask,
- ref_frame_skip_mask))
- continue;
-
- if (ref_frame == INTRA_FRAME) {
- if (sf->skip_intra_in_interframe && search_state.skip_intra_modes)
- continue;
+ // Reached the first compound prediction mode
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+ reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, &search_state);
+ reach_first_comp_mode = 1;
}
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
+ ref_frame_skip_mask, &search_state);
+ if (ret == 1) continue;
+ args.skip_motion_mode = (ret == 2);
- if (sf->drop_ref) {
- if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) {
- if (search_state.num_available_refs > 2) {
- if ((ref_frame == search_state.dist_order_refs[0] &&
- second_ref_frame == search_state.dist_order_refs[1]) ||
- (ref_frame == search_state.dist_order_refs[1] &&
- second_ref_frame == search_state.dist_order_refs[0]))
- continue;
- }
+ if (sf->drop_ref && comp_pred) {
+ if (sf_check_is_drop_ref(mode_order, &search_state)) {
+ continue;
}
}
if (search_state.best_rd < search_state.mode_threshold[mode_index])
continue;
- const int comp_pred = second_ref_frame > INTRA_FRAME;
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ continue;
+ }
+
const int ref_frame_cost = comp_pred
? ref_costs_comp[ref_frame][second_ref_frame]
: ref_costs_single[ref_frame];
@@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
}
if (ref_frame == INTRA_FRAME) {
- RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
- const int ret = handle_intra_mode(
- &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip,
- &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
- if (!ret) {
- continue;
- }
- rate2 = intra_rd_stats.rate;
- distortion2 = intra_rd_stats.dist;
- this_rd = RDCOST(x->rdmult, rate2, distortion2);
- skippable = intra_rd_stats.skip;
- rate_y = intra_rd_stats_y.rate;
+ intra_mode_idx_ls[intra_mode_num++] = mode_index;
+ continue;
} else {
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
mbmi->angle_delta[PLANE_TYPE_UV] = 0;
@@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
args.single_newmv = search_state.single_newmv;
args.single_newmv_rate = search_state.single_newmv_rate;
args.single_newmv_valid = search_state.single_newmv_valid;
- args.modelled_rd = search_state.modelled_rd;
args.single_comp_cost = real_compmode_cost;
args.ref_frame_cost = ref_frame_cost;
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
- &rd_stats_uv, &disable_skip, mi_row, mi_col,
- &args, ref_best_rd, &best_est_rd);
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+ mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+ &best_est_rd, do_tx_search, inter_modes_info);
#else
this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
&rd_stats_uv, &disable_skip, mi_row, mi_col,
- &args, ref_best_rd);
+ &args, ref_best_rd, tmp_buf, &rd_buffers);
#endif
rate2 = rd_stats.rate;
skippable = rd_stats.skip;
@@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
rate_uv = rd_stats_uv.rate;
}
+ if (sf->prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
if (this_rd == INT64_MAX) continue;
this_skip2 = mbmi->skip;
@@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
search_state.best_mbmode = *mbmi;
search_state.best_skip2 = this_skip2;
search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (do_tx_search) {
+ // When do_tx_search == 0, handle_inter_mode won't provide correct
+ // rate_y and rate_uv because the txfm_search process is replaced by
+ // rd estimation.
+ // Therefore, avoid updating best_rate_y and best_rate_uv here; these
+ // two values will be updated when txfm_search is called.
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+ }
+#else // CONFIG_COLLECT_INTER_MODE_RD_STATS
search_state.best_rate_y =
rate_y +
x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
search_state.best_rate_uv = rate_uv;
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
}
@@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
+ if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+ // Collect data from single ref mode, and analyze data.
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+ }
- if (sf->drop_ref) {
- if (second_ref_frame == NONE_FRAME) {
- const int idx = ref_frame - LAST_FRAME;
- if (idx && distortion2 > search_state.dist_refs[idx]) {
- search_state.dist_refs[idx] = distortion2;
- search_state.dist_order_refs[idx] = ref_frame;
- }
+ if (x->skip && !comp_pred) break;
+ }
- // Reach the last single ref prediction mode
- if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
- // bubble sort dist_refs and the order index
- for (i = 0; i < REF_FRAMES; ++i) {
- for (k = i + 1; k < REF_FRAMES; ++k) {
- if (search_state.dist_refs[i] < search_state.dist_refs[k]) {
- int64_t tmp_dist = search_state.dist_refs[i];
- search_state.dist_refs[i] = search_state.dist_refs[k];
- search_state.dist_refs[k] = tmp_dist;
-
- int tmp_idx = search_state.dist_order_refs[i];
- search_state.dist_order_refs[i] =
- search_state.dist_order_refs[k];
- search_state.dist_order_refs[k] = tmp_idx;
- }
- }
- }
+ aom_free(tmp_buf_orig);
+ tmp_buf_orig = NULL;
+ release_compound_type_rd_buffers(&rd_buffers);
- for (i = 0; i < REF_FRAMES; ++i) {
- if (search_state.dist_refs[i] == -1) break;
- search_state.num_available_refs = i;
- }
- search_state.num_available_refs++;
- }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state.best_rd = INT64_MAX;
+
+ int64_t top_est_rd =
+ inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+ for (int j = 0; j < inter_modes_info->num; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.9 > top_est_rd) {
+ continue;
+ }
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_rate, search_state.best_rd)) {
+ continue;
+ } else {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+ if (rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = rd_stats.rdcost;
+ // Note index of best mode so far
+ const int mode_index = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ search_state.best_mode_index = mode_index;
+ *rd_cost = rd_stats;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = mbmi->skip;
+ search_state.best_mode_skippable = rd_stats.skip;
+ search_state.best_rate_y =
+ rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+ search_state.best_rate_uv = rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
}
}
+ }
+#endif
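When do_tx_search is 0, candidates are queued with model-estimated RD costs, sorted, and only the promising ones get the real transform search. The gate in the loop above is curr_est_rd * 0.9 > top_est_rd, i.e. anything more than about 11% worse than the best estimate is dropped. A standalone sketch of the rule:

    #include <stdbool.h>
    #include <stdint.h>

    /* A candidate survives while curr_est_rd <= top_est_rd / 0.9, so with
     * top_est_rd = 1000 a candidate of 1111 is kept and 1112 is pruned. */
    static bool prune_by_est_rd(int64_t curr_est_rd, int64_t top_est_rd) {
      return curr_est_rd * 0.9 > top_est_rd;
    }
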
- if (x->skip && !comp_pred) break;
+ for (int j = 0; j < intra_mode_num; ++j) {
+ const int mode_index = intra_mode_idx_ls[j];
+ const MV_REFERENCE_FRAME ref_frame =
+ av1_mode_order[mode_index].ref_frame[0];
+ assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+ assert(ref_frame == INTRA_FRAME);
+ if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+ init_mbmi(mbmi, mode_index, cm);
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ }
+
+ RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+ const int ref_frame_cost = ref_costs_single[ref_frame];
+ intra_rd_stats.rdcost = handle_intra_mode(
+ &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+ &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+ if (intra_rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = intra_rd_stats.rdcost;
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+ *rd_cost = intra_rd_stats;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = intra_rd_stats.skip;
+ search_state.best_rate_y =
+ intra_rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+ search_state.best_rate_uv = intra_rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
}
// In effect only when speed >= 2.
@@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
// Only try palette mode when the best mode so far is an intra mode.
if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
- search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi,
+ search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
ref_costs_single, &search_state);
}
@@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
- mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
// Select the samples according to motion vector difference
- if (mbmi->num_proj_ref[0] > 1)
- mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
- mbmi->num_proj_ref[0], bsize);
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
}
set_default_interp_filters(mbmi, cm->interp_filter);
@@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above(
struct calc_target_weighted_pred_ctxt *ctxt =
(struct calc_target_weighted_pred_ctxt *)fun_ctxt;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
@@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left(
struct calc_target_weighted_pred_ctxt *ctxt =
(struct calc_target_weighted_pred_ctxt *)fun_ctxt;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
@@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
int above_stride, const uint8_t *left,
int left_stride) {
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- const int bw = xd->n8_w << MI_SIZE_LOG2;
- const int bh = xd->n8_h << MI_SIZE_LOG2;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
+ const int bh = xd->n4_h << MI_SIZE_LOG2;
int32_t *mask_buf = x->mask_buf;
int32_t *wsrc_buf = x->wsrc_buf;
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 12df472c1..4c11f90b8 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_RDOPT_H_
-#define AV1_ENCODER_RDOPT_H_
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
#include "av1/common/blockd.h"
#include "av1/common/txb_common.h"
@@ -25,6 +25,10 @@ extern "C" {
#endif
#define MAX_REF_MV_SERCH 3
+#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1
+#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2
+#define DEFAULT_INTERP_SKIP_FLAG \
+ (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)
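The two defaults form a two-bit mask (luma = bit 0, chroma = bit 1) that DEFAULT_INTERP_SKIP_FLAG combines. A quick self-contained illustration of setting and testing the bits (the enum names mirror the defines; the flag's real consumers live elsewhere in the encoder):

    #include <assert.h>

    enum { LUMA_SKIP = 1, CHROMA_SKIP = 2 };

    int main(void) {
      int flag = LUMA_SKIP | CHROMA_SKIP;  /* DEFAULT_INTERP_SKIP_FLAG == 3 */
      assert(flag & LUMA_SKIP);
      assert(flag & CHROMA_SKIP);
      flag &= ~LUMA_SKIP;                  /* re-enable luma filtering only */
      assert(flag == CHROMA_SKIP);
      return 0;
    }
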
struct TileInfo;
struct macroblock;
@@ -111,7 +115,7 @@ unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs, int bd);
-void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x, int mi_row, int mi_col,
struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -123,14 +127,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
#if CONFIG_COLLECT_INTER_MODE_RD_STATS
-#define INTER_MODE_RD_TEST 0
-void av1_inter_mode_data_init();
-void av1_inter_mode_data_fit(int rdmult);
-void av1_inter_mode_data_show(const AV1_COMMON *cm);
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_RDOPT_H_
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 000000000..23d920fc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+ int bw, int bh) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+ } else {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
+ }
+}
+
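In the unscaled branch of calc_subpel_params, the 1/16-pel MV splits into an integer pixel offset (mv >> SUBPEL_BITS) and a fractional phase ((mv & SUBPEL_MASK) << SCALE_EXTRA_BITS). A standalone illustration, assuming libaom's usual constants (SUBPEL_BITS = 4, SCALE_EXTRA_BITS = 6):

    #include <stdio.h>

    #define SUBPEL_BITS_SKETCH 4
    #define SUBPEL_MASK_SKETCH ((1 << SUBPEL_BITS_SKETCH) - 1)
    #define SCALE_EXTRA_BITS_SKETCH 6

    int main(void) {
      const int mv_col_q4 = 23;  /* 1 full pel + 7/16 pel */
      const int int_pel = mv_col_q4 >> SUBPEL_BITS_SKETCH;            /* 1 */
      const int subpel_x =
          (mv_col_q4 & SUBPEL_MASK_SKETCH) << SCALE_EXTRA_BITS_SKETCH; /* 448 */
      printf("integer offset %d, scaled subpel phase %d\n", int_pel, subpel_x);
      return 0;
    }
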
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+ // Block sizes: b4 for each sub8x8 unit, b8 for the enclosing plane block.
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // Masked compound types have their own averaging mechanism.
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+ b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+ return;
+ }
+
+ {
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
+ // Masked compound types have their own averaging mechanism.
+ conv_params.do_average = 0;
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, plane, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+ cm->allow_warped_motion);
+ } else {
+ conv_params.do_average = ref;
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+ mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+ }
+}
+
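A note on the do_average = ref convention used in build_inter_predictors: pass 0 writes the first prediction into the destination (or intermediate) buffer, and pass 1 combines with it, unless a masked compound blends explicitly. Schematically (a plain rounded average is shown; the encoder may apply jnt weights instead):

    #include <stdint.h>

    static void accumulate_pred_sketch(int32_t *acc, const int32_t *pred,
                                       int n, int do_average) {
      for (int i = 0; i < n; ++i)
        acc[i] = do_average ? (acc[i] + pred[i] + 1) >> 1  /* second ref */
                            : pred[i];                     /* first ref  */
    }
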
+static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ int plane_from, int plane_to) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = pd->width;
+ const int bh = pd->height;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+ }
+}
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
+}
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
+ plane_idx);
+ }
+}
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx) {
+ build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
+ plane_idx);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
+ if (!ctx) {
+ default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf;
+ default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride;
+ ctx = &default_ctx;
+ }
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf,
+ xd->plane[plane_idx].dst.stride, ctx,
+ plane_idx, bsize);
+ }
+}
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ const int num_planes = av1_num_planes(cm);
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
+
+// TODO(sarahparker):
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+ mv.col += SCALE_EXTRA_OFF;
+ mv.row += SCALE_EXTRA_OFF;
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ mv.col & SCALE_SUBPEL_MASK,
+ mv.row & SCALE_SUBPEL_MASK };
+ src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+ (mv.col >> SCALE_SUBPEL_BITS);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+ w, h, conv_params, interp_filters, warp_types, p_col,
+ p_row, plane, ref, xd->mi[0], 0, xd,
+ can_use_previous);
+}
+
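av1_build_inter_predictor doubles Q3 (1/8-pel) motion vectors into the Q4 (1/16-pel) domain before scaling; a tiny standalone check (the enum name here is illustrative, the real one is enum mv_precision):

    #include <assert.h>

    enum precision_sketch { PREC_Q3, PREC_Q4 };

    static int to_q4(int v, enum precision_sketch p) {
      return p == PREC_Q4 ? v : v * 2;  /* 1/8-pel -> 1/16-pel units */
    }

    int main(void) {
      assert(to_q4(12, PREC_Q3) == 24);  /* 1.5 pel in either domain */
      assert(to_q4(24, PREC_Q4) == 24);
      return 0;
    }
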
+static INLINE void build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+ av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+ above_mbmi, ctxt, num_planes);
+ mi_x = above_mi_col << MI_SIZE_LOG2;
+ mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *above_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+
+ // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+ // prediction block. This is half the height of the original block,
+ // except for blocks of height 128, where we only use a height of 32.
+ int this_height = xd->n4_h * MI_SIZE;
+ int pred_height = AOMMIN(this_height / 2, 32);
+ xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_right_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_prediction_by_above_pred, &ctxt);
+
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+ xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+}
+
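The mb_to_*_edge fields are kept in 1/8-pel units, hence the * 8 adjustments above. For a 128-pixel-tall block (MI_SIZE = 4, n4_h = 32) the OBMC prediction uses only 32 rows, so the bottom edge temporarily moves by (128 - 32) * 8:

    #include <assert.h>

    #define MI_SIZE_SKETCH 4

    int main(void) {
      const int n4_h = 32;                            /* 128-tall block */
      const int this_height = n4_h * MI_SIZE_SKETCH;  /* 128 */
      const int pred_height = this_height / 2 < 32 ? this_height / 2 : 32;
      assert(pred_height == 32);
      assert((this_height - pred_height) * 8 == 768); /* 1/8-pel delta */
      return 0;
    }
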
+static INLINE void build_prediction_by_left_pred(
+ MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+ av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+ left_mbmi, ctxt, num_planes);
+ mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+ mi_y = left_mi_row << MI_SIZE_LOG2;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *left_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+
+ // Adjust mb_to_right_edge to have the correct value for the OBMC
+ // prediction block. This is half the width of the original block,
+ // except for 128-wide blocks, where we only use a width of 32.
+ int this_width = xd->n4_w * MI_SIZE;
+ int pred_width = AOMMIN(this_width / 2, 32);
+ xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_bottom_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_prediction_by_left_pred, &ctxt);
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+ xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+ } else {
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+ }
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+ mi_row, mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+ dst_buf2, dst_stride2);
+}
+
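The two OBMC scratch buffers pack Y, U and V back to back, MAX_SB_SQUARE samples per plane, with offsets doubled for 16-bit samples, which is what the CONVERT_TO_BYTEPTR arithmetic above computes. The offset rule, as a sketch:

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_SB_SQUARE_SKETCH (128 * 128)

    /* Start of plane p within one packed OBMC scratch buffer. */
    static size_t obmc_plane_offset(int plane, int use_highbd) {
      const size_t sample = use_highbd ? sizeof(uint16_t) : sizeof(uint8_t);
      return (size_t)plane * MAX_SB_SQUARE_SKETCH * sample;
    }
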
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+ int bw, int bh, int x, int y,
+ int w, int h, int mi_x, int mi_y,
+ int ref, uint8_t *const ext_dst,
+ int ext_dst_stride,
+ int can_use_previous) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO *mi = xd->mi[0];
+
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
+ const MV mv = mi->mv[ref].as_mv;
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ const int pre_x = (mi_x) >> pd->subsampling_x;
+ const int pre_y = (mi_y) >> pd->subsampling_y;
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+ &subpel_params, sf, w, h, &conv_params,
+ mi->interp_filters, &warp_types, pre_x + x,
+ pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
+ mi_y, ref, ext_dst[plane],
+ ext_dst_stride[plane], can_use_previous);
+ }
+}
+
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
+
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+
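The subw/subh derivation compares the passed-in dimensions with twice the block's 4x4-unit count: for BLOCK_8X8 (mi_size_high_log2 == 1, assumed from libaom's tables), a chroma call with h = 4 gives subh = 1 while a luma call with h = 8 gives subh = 0. A standalone check:

    #include <assert.h>

    int main(void) {
      const int mi_size_high_log2_8x8 = 1;  /* assumed table value */
      assert((((2 << mi_size_high_log2_8x8) == 4) ? 1 : 0) == 1); /* chroma */
      assert((((2 << mi_size_high_log2_8x8) == 8) ? 1 : 0) == 0); /* luma   */
      return 0;
    }
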
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+ mbmi->sb_type, h, w, xd->bd);
+ else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+ h, w);
+ } else {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+ xd->bd);
+ else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+ 0, NULL, 0, w, h);
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]) {
+ int plane;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+}
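The wedge helpers at the end of this file re-blend two previously rendered single-ref predictions instead of re-running motion compensation for every mask candidate. The underlying a64 blend, schematically (weights run 0..64 as aom_blend_a64_mask assumes; the real call also handles subsampling and high bit depth):

    #include <stdint.h>

    static void blend_a64_sketch(uint8_t *dst, const uint8_t *p0,
                                 const uint8_t *p1, const uint8_t *mask,
                                 int n) {
      for (int i = 0; i < n; ++i)
        dst[i] =
            (uint8_t)((mask[i] * p0[i] + (64 - mask[i]) * p1[i] + 32) >> 6);
    }
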
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 000000000..10d5e8c28
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/filter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx);
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+// Detect whether the block has sub-pixel level motion vectors
+// per component.
+#define CHECK_SUBPEL 0
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
+ const MACROBLOCKD *const xd,
+ int dir) {
+#if CHECK_SUBPEL
+ const int ref = (dir >> 1);
+ (void)xd;  // Unused when CHECK_SUBPEL is enabled.
+
+ if (dir & 0x01) {
+ if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+ } else {
+ if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+ }
+
+ return 0;
+#else
+ (void)mbmi;
+ (void)xd;
+ (void)dir;
+ return 1;
+#endif
+}
+
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int is_compound = has_second_ref(mi);
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ int row_col;
+ for (row_col = 0; row_col < 2; ++row_col) {
+ const int dir = (ref << 1) + row_col;
+ if (has_subpel_mv_component(mi, xd, dir)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
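has_subpel_mv_component packs its query into dir: bit 0 selects the column (1) or row (0) component and bit 1 the reference, which av1_is_interp_search_needed enumerates as dir = (ref << 1) + row_col. A minimal decode:

    #include <assert.h>

    static void decode_dir_sketch(int dir, int *ref, int *is_col) {
      *ref = dir >> 1;    /* bit 1: reference index  */
      *is_col = dir & 1;  /* bit 0: column vs row MV */
    }

    int main(void) {
      int ref, is_col;
      decode_dir_sketch(3, &ref, &is_col);
      assert(ref == 1 && is_col == 1);
      return 0;
    }
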
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous);
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
index a207b0f26..1ad13d66a 100644
--- a/third_party/aom/av1/encoder/segmentation.h
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_SEGMENTATION_H_
-#define AV1_ENCODER_SEGMENTATION_H_
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
#include "av1/common/blockd.h"
#include "av1/encoder/encoder.h"
@@ -35,4 +35,4 @@ void av1_reset_segment_features(AV1_COMMON *cm);
} // extern "C"
#endif
-#endif // AV1_ENCODER_SEGMENTATION_H_
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index d4b4b19c4..4c35baae0 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -98,6 +98,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
sf->use_square_partition_only_threshold = BLOCK_64X64;
}
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+
if (speed >= 1) {
if (is_720p_or_larger) {
sf->use_square_partition_only_threshold = BLOCK_128X128;
@@ -106,6 +115,14 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
} else {
sf->use_square_partition_only_threshold = BLOCK_32X32;
}
+
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
}
if (speed >= 2) {
@@ -126,13 +143,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
if (speed >= 3) {
if (is_720p_or_larger) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 25);
sf->partition_search_breakout_rate_thr = 200;
} else {
sf->max_intra_bsize = BLOCK_32X32;
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
sf->partition_search_breakout_dist_thr = (1 << 23);
sf->partition_search_breakout_rate_thr = 120;
}
@@ -166,6 +181,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
// Speed 0 for all speed features that give neutral coding performance change.
sf->reduce_inter_modes = 1;
sf->prune_ext_partition_types_search_level = 1;
+ sf->ml_prune_rect_partition = 1;
sf->ml_prune_ab_partition = 1;
sf->ml_prune_4_partition = 1;
sf->adaptive_txb_search_level = 1;
@@ -173,6 +189,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->model_based_prune_tx_search_level = 1;
sf->model_based_post_interp_filter_breakout = 1;
sf->inter_mode_rd_model_estimation = 1;
+ sf->prune_ref_frame_for_rect_partitions =
+ !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+ sf->less_rectangular_check_level = 1;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+ sf->gm_disable_recode = 1;
if (speed >= 1) {
sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -182,8 +203,10 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->intra_tx_size_search_init_depth_rect = 1;
sf->intra_tx_size_search_init_depth_sqr = 1;
sf->tx_size_search_lgr_block = 1;
- sf->two_pass_partition_search = 1;
- sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
+ sf->two_pass_partition_search = 1;
+ sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ }
sf->prune_ext_partition_types_search_level = 2;
sf->use_fast_interpolation_filter_search = 1;
sf->skip_repeat_interpolation_filter_search = 1;
@@ -198,6 +221,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->use_intra_txb_hash = 1;
sf->optimize_b_precheck = 1;
sf->dual_sgr_penalty_level = 1;
+ sf->use_accurate_subpel_search = 1;
+ sf->reuse_inter_intra_mode = 1;
+ sf->prune_comp_search_by_single_result = 1;
+ sf->skip_repeated_newmv = 1;
+ sf->obmc_full_pixel_search_level = 1;
}
if (speed >= 2) {
@@ -206,7 +234,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->selective_ref_frame = 2;
sf->fast_cdef_search = 1;
- sf->use_rd_breakout = 1;
sf->adaptive_rd_thresh = 1;
sf->mv.auto_mv_step_size = 1;
sf->mv.subpel_iters_per_step = 1;
@@ -224,8 +251,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
if (speed >= 3) {
sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
- sf->less_rectangular_check = 1;
- sf->mode_skip_start = 10;
+ sf->less_rectangular_check_level = 2;
sf->adaptive_pred_interp_filter = 1;
// adaptive_motion_search breaks encoder multi-thread tests.
// The values in x->pred_mv[] differ for single and multi-thread cases.
@@ -237,6 +263,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->adaptive_rd_thresh = 2;
sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
sf->gm_search_type = GM_DISABLE_SEARCH;
+ sf->prune_comp_search_by_single_result = 2;
}
if (speed >= 4) {
@@ -250,10 +277,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
sf->cb_partition_search = !boosted;
- sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
- sf->mode_skip_start = 6;
- sf->adaptive_interp_filter_search = 1;
}
if (speed >= 5) {
@@ -276,7 +300,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
- sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
sf->partition_search_breakout_rate_thr = 300;
sf->use_transform_domain_distortion = 2;
@@ -296,33 +319,17 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->simple_model_rd_from_var = 1;
}
if (speed >= 7) {
- const int is_keyframe = cm->frame_type == KEY_FRAME;
- const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
sf->default_max_partition_size = BLOCK_32X32;
sf->default_min_partition_size = BLOCK_8X8;
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->frame_parameter_update = 0;
sf->mv.search_method = FAST_HEX;
- sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
- sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
- sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
sf->partition_search_type = REFERENCE_PARTITION;
- sf->reuse_inter_pred_sby = 1;
- sf->force_frame_boost =
- is_keyframe ||
- (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
- sf->max_delta_qindex = is_keyframe ? 20 : 15;
- sf->coeff_prob_appx_step = 4;
sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
}
if (speed >= 8) {
sf->mv.search_method = FAST_DIAMOND;
- sf->mv.fullpel_search_step_param = 10;
sf->mv.subpel_force_stop = 2;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
}
@@ -356,54 +363,6 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
}
-static void set_dev_sf(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) {
- AV1_COMMON *const cm = &cpi->common;
-
- if (speed & TXFM_CODING_SF) {
- sf->inter_tx_size_search_init_depth_rect = 1;
- sf->inter_tx_size_search_init_depth_sqr = 1;
- sf->intra_tx_size_search_init_depth_rect = 1;
- sf->intra_tx_size_search_init_depth_sqr = 1;
- sf->tx_size_search_method = USE_FAST_RD;
- sf->tx_type_search.fast_intra_tx_type_search = 1;
- sf->tx_type_search.fast_inter_tx_type_search = 1;
- }
-
- if (speed & INTER_PRED_SF) {
- sf->selective_ref_frame = 2;
- // sf->adaptive_motion_search = 1;
- sf->mv.auto_mv_step_size = 1;
- sf->adaptive_rd_thresh = 1;
- sf->mv.subpel_iters_per_step = 1;
- sf->adaptive_pred_interp_filter = 1;
- }
-
- if (speed & INTRA_PRED_SF) {
- sf->max_intra_bsize = BLOCK_32X32;
- }
-
- if (speed & PARTITION_SF) {
- if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
- has_internal_image_edge(cpi)) {
- sf->use_square_partition_only_threshold =
- frame_is_boosted(cpi) ? BLOCK_128X128 : BLOCK_4X4;
- } else {
- sf->use_square_partition_only_threshold =
- frame_is_intra_only(cm) ? BLOCK_128X128 : BLOCK_4X4;
- }
- sf->less_rectangular_check = 1;
- sf->prune_ext_partition_types_search_level = 2;
- }
-
- if (speed & LOOP_FILTER_SF) {
- sf->fast_cdef_search = 1;
- }
-
- if (speed & RD_SKIP_SF) {
- sf->use_rd_breakout = 1;
- }
-}
-
void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
SPEED_FEATURES *const sf = &cpi->sf;
@@ -432,9 +391,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
#endif // DISABLE_TRELLISQ_SEARCH
sf->gm_erroradv_type = GM_ERRORADV_TR_0;
sf->mv.reduce_first_step_size = 0;
- sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
- sf->mv.fullpel_search_step_param = 6;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
sf->tx_size_search_method = USE_FULL_RD;
@@ -450,7 +407,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 0;
- sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->alt_ref_search_fp = 0;
sf->partition_search_type = SEARCH_PARTITION;
@@ -461,22 +417,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->tx_type_search.fast_inter_tx_type_search = 0;
sf->tx_type_search.skip_tx_search = 0;
sf->selective_ref_frame = 0;
- sf->less_rectangular_check = 0;
+ sf->less_rectangular_check_level = 0;
sf->use_square_partition_only_threshold = BLOCK_128X128;
+ sf->prune_ref_frame_for_rect_partitions = 0;
sf->auto_min_max_partition_size = NOT_IN_USE;
sf->rd_auto_partition_min_limit = BLOCK_4X4;
sf->default_max_partition_size = BLOCK_LARGEST;
sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
- sf->last_partitioning_redo_frequency = 4;
sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
- sf->force_frame_boost = 0;
- sf->max_delta_qindex = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->adaptive_interp_filter_search = 0;
sf->allow_partition_search_skip = 0;
- sf->use_accurate_subpel_search = 1;
+ sf->use_accurate_subpel_search = 2;
sf->disable_wedge_search_var_thresh = 0;
sf->fast_wedge_sign_estimate = 0;
sf->drop_ref = 0;
@@ -491,48 +444,46 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->optimize_b_precheck = 0;
sf->jnt_comp_fast_tx_search = 0;
sf->jnt_comp_skip_mv_search = 0;
+ sf->reuse_inter_intra_mode = 0;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
}
- sf->use_rd_breakout = 0;
sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
- sf->use_fast_coef_updates = TWO_LOOP;
sf->use_fast_coef_costing = 0;
- sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
- sf->schedule_mode_search = 0;
- for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_LARGEST;
- sf->reuse_inter_pred_sby = 0;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
sf->always_this_block_size = BLOCK_16X16;
- sf->search_type_check_frequency = 50;
// Recode loop tolerance %.
sf->recode_tolerance = 25;
- sf->default_interp_filter = SWITCHABLE;
sf->partition_search_breakout_dist_thr = 0;
sf->partition_search_breakout_rate_thr = 0;
sf->simple_model_rd_from_var = 0;
sf->prune_ext_partition_types_search_level = 0;
+ sf->ml_prune_rect_partition = 0;
sf->ml_prune_ab_partition = 0;
sf->ml_prune_4_partition = 0;
sf->fast_cdef_search = 0;
+ for (i = 0; i < PARTITION_BLOCK_SIZES; ++i)
+ sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled.
// Set this at the appropriate speed levels
sf->use_transform_domain_distortion = 0;
sf->gm_search_type = GM_FULL_SEARCH;
+ sf->gm_disable_recode = 0;
sf->use_fast_interpolation_filter_search = 0;
sf->skip_repeat_interpolation_filter_search = 0;
sf->use_hash_based_trellis = 0;
+ sf->prune_comp_search_by_single_result = 0;
+ sf->skip_repeated_newmv = 0;
// Set decoder side speed feature to use less dual sgr modes
sf->dual_sgr_penalty_level = 0;
sf->inter_mode_rd_model_estimation = 0;
-
- set_dev_sf(cpi, sf, oxcf->dev_sf);
+ sf->obmc_full_pixel_search_level = 0;
if (oxcf->mode == GOOD)
set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
@@ -599,10 +550,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
x->min_partition_size = sf->default_min_partition_size;
x->max_partition_size = sf->default_max_partition_size;
- if (!cpi->oxcf.frame_periodic_boost) {
- sf->max_delta_qindex = 0;
- }
-
// This is only used in motion vector unit test.
if (cpi->oxcf.motion_vector_unit_test == 1)
cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
@@ -611,5 +558,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
#if CONFIG_DIST_8X8
if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+ if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
#endif // CONFIG_DIST_8X8
}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index d0408ba2f..41013b2e7 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_SPEED_FEATURES_H_
-#define AV1_ENCODER_SPEED_FEATURES_H_
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
#include "av1/common/enums.h"
@@ -54,25 +54,6 @@ enum {
(1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
(1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
(1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
- INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
- INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
- (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
- INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) |
- (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
- (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
- INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << GLOBALMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
- (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) |
- (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) |
- (1 << NEAR_NEWMV),
- INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) |
- (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
- (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
- (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
- (1 << NEAR_NEARMV),
INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
(1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
(1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
@@ -133,11 +114,6 @@ typedef enum {
} SUBPEL_SEARCH_METHODS;
typedef enum {
- NO_MOTION_THRESHOLD = 0,
- LOW_MOTION_THRESHOLD = 7
-} MOTION_THRESHOLD;
-
-typedef enum {
USE_FULL_RD = 0,
USE_FAST_RD,
USE_LARGESTALL,
@@ -179,12 +155,6 @@ typedef enum {
} MODE_SEARCH_SKIP_LOGIC;
typedef enum {
- FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
- FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
- FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
-} INTERP_FILTER_MASK;
-
-typedef enum {
NO_PRUNE = 0,
// eliminates one tx type in vertical and horizontal direction
PRUNE_ONE = 1,
@@ -224,16 +194,6 @@ typedef enum {
REFERENCE_PARTITION
} PARTITION_SEARCH_TYPE;
-typedef enum {
- // Does a dry run to see if any of the contexts need to be updated or not,
- // before the final run.
- TWO_LOOP = 0,
-
- // No dry run, also only half the coef contexts and bands are updated.
- // The rest are not updated at all.
- ONE_LOOP_REDUCED = 1
-} FAST_COEFF_UPDATE;
-
typedef struct MV_SPEED_FEATURES {
// Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
SEARCH_METHODS search_method;
@@ -257,9 +217,6 @@ typedef struct MV_SPEED_FEATURES {
// Control when to stop subpel search
int subpel_force_stop;
-
- // This variable sets the step_param used in full pel motion search.
- int fullpel_search_step_param;
} MV_SPEED_FEATURES;
#define MAX_MESH_STEP 4
@@ -332,13 +289,6 @@ typedef struct SPEED_FEATURES {
// mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
- // Coefficient probability model approximation step size
- int coeff_prob_appx_step;
-
- // The threshold is to determine how slow the motino is, it is used when
- // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
- MOTION_THRESHOLD lf_motion_threshold;
-
// Determine which method we use to determine transform size. We can choose
// between options like full rd, largest for prediction size, largest
// for intra and model coefs for the rest.
@@ -355,11 +305,6 @@ typedef struct SPEED_FEATURES {
// largest transform only, since the largest transform block size is 64x64.
int tx_size_search_lgr_block;
- // After looking at the first set of modes (set by index here), skip
- // checking modes for reference frames that don't match the reference frame
- // of the best so far.
- int mode_skip_start;
-
PARTITION_SEARCH_TYPE partition_search_type;
TX_TYPE_SEARCH tx_type_search;
@@ -397,6 +342,9 @@ typedef struct SPEED_FEATURES {
// aggressiveness of pruning in order.
int prune_ext_partition_types_search_level;
+ // Use an ML model to prune horz and vert partitions.
+ int ml_prune_rect_partition;
+
// Use an ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
int ml_prune_ab_partition;
@@ -413,12 +361,16 @@ typedef struct SPEED_FEATURES {
int mode_pruning_based_on_two_pass_partition_search;
// Skip rectangular partition test when partition type none gives better
- // rd than partition type split.
- int less_rectangular_check;
+ // rd than partition type split. Can take values 0 - 2: 0 means no
+ // skipping, and 1 - 2 skip with increasing aggressiveness.
+ int less_rectangular_check_level;
// Use square partition only beyond this block size.
BLOCK_SIZE use_square_partition_only_threshold;
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
// Sets min and max partition sizes for this superblock based on the
// same superblock in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
@@ -435,10 +387,6 @@ typedef struct SPEED_FEATURES {
// frame's partitioning. Only used if use_lastframe_partitioning is set.
int adjust_partitioning_from_last_frame;
- // How frequently we redo the partitioning from scratch. Only used if
- // use_lastframe_partitioning is set.
- int last_partitioning_redo_frequency;
-
// Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
// it always, to allow it for only Last frame and Intra, disable it for all
// inter modes or to enable it always.
@@ -461,8 +409,6 @@ typedef struct SPEED_FEATURES {
// Pattern to be used for any exhaustive mesh searches.
MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
- int schedule_mode_search;
-
// Allows sub 8x8 modes to use the prediction filter that was determined
// best for 8x8 mode. If set to 0 we always recheck all the filters for
// sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
@@ -472,20 +418,10 @@ typedef struct SPEED_FEATURES {
// Adaptive prediction mode search
int adaptive_mode_search;
- // Chessboard pattern prediction filter type search
- int cb_pred_filter_search;
-
int cb_partition_search;
int alt_ref_search_fp;
- // Use finer quantizer in every other few frames that run variable block
- // partition type search.
- int force_frame_boost;
-
- // Maximally allowed base quantization index fluctuation.
- int max_delta_qindex;
-
// Implements various heuristics to skip searching modes
// The heuristics selected are based on flags
// defined in the MODE_SEARCH_SKIP_HEURISTICS enum
@@ -506,22 +442,9 @@ typedef struct SPEED_FEATURES {
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
- // This variable enables an early break out of mode testing if the model for
- // rd built from the prediction signal indicates a value that's much
- // higher than the best rd we've seen so far.
- int use_rd_breakout;
-
// This feature controls how the loop filter level is determined.
LPF_PICK_METHOD lpf_pick;
- // This feature limits the number of coefficients updates we actually do
- // by only looking at counts from 1/2 the bands.
- FAST_COEFF_UPDATE use_fast_coef_updates;
-
- // A binary mask indicating if NEARESTMV, NEARMV, GLOBALMV, NEWMV
- // modes are used in order from LSB to MSB for each BLOCK_SIZE.
- int inter_mode_mask[BLOCK_SIZES_ALL];
-
// This feature controls whether we do the expensive context update and
// calculation in the rd coefficient costing loop.
int use_fast_coef_costing;
@@ -535,28 +458,13 @@ typedef struct SPEED_FEATURES {
// TODO(aconverse): Fold this into one of the other many mode skips
BLOCK_SIZE max_intra_bsize;
- // The frequency that we check if
- // FIXED_PARTITION search type should be used.
- int search_type_check_frequency;
-
- // When partition is pre-set, the inter prediction result from pick_inter_mode
- // can be reused in final block encoding process. It is enabled only for real-
- // time mode speed 6.
- int reuse_inter_pred_sby;
-
- // default interp filter choice
- InterpFilter default_interp_filter;
-
- // adaptive interp_filter search to allow skip of certain filter types.
- int adaptive_interp_filter_search;
-
- // mask for skip evaluation of certain interp_filter type.
- INTERP_FILTER_MASK interp_filter_search_mask;
-
// Partition search early breakout thresholds.
int64_t partition_search_breakout_dist_thr;
int partition_search_breakout_rate_thr;
+ // Thresholds for ML-based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
// Allow skipping partition search for still image frame
int allow_partition_search_skip;
@@ -577,6 +485,9 @@ typedef struct SPEED_FEATURES {
GM_SEARCH_TYPE gm_search_type;
+ // Whether to disable the global motion recode loop.
+ int gm_disable_recode;
+
// Do limited interpolation filter search for dual filters, since best choice
// usually includes EIGHTTAP_REGULAR.
int use_fast_interpolation_filter_search;
@@ -624,6 +535,25 @@ typedef struct SPEED_FEATURES {
// Dynamically estimate final rd from prediction error and mode cost
int inter_mode_rd_model_estimation;
+
+ // Skip some ref frames in compound motion search based on the single motion
+ // search result. Has three levels for now: 0 referring to no skipping, and
+ // 1 - 3 increasing aggressiveness of skipping in order.
+ // Note: The search order might affect the result. It is better to search the
+ // same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // Set the full-pixel search level of obmc:
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // Flag to skip NEWMV mode in drl if the motion search result is the same.
+ int skip_repeated_newmv;
} SPEED_FEATURES;
struct AV1_COMP;
@@ -635,4 +565,4 @@ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
} // extern "C"
#endif
-#endif // AV1_ENCODER_SPEED_FEATURES_H_
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
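
Several of the speed features added above (less_rectangular_check_level, prune_ref_frame_for_rect_partitions, prune_comp_search_by_single_result) follow the same convention: a small integer where 0 disables the feature and higher values skip more work. As a rough sketch only (the helper below is hypothetical, not code from this patch), such a level is typically consumed as a monotone gate in the partition search:

/* Hypothetical sketch: each level of less_rectangular_check_level enables
 * strictly more aggressive skipping of the rectangular partition test. */
static int sketch_skip_rect_partition(const SPEED_FEATURES *sf,
                                      int none_rd_beats_split,
                                      int split_rd_much_worse) {
  if (sf->less_rectangular_check_level >= 1 && none_rd_beats_split)
    return 1; /* level 1: skip when PARTITION_NONE already beats SPLIT */
  if (sf->less_rectangular_check_level >= 2 && split_rd_much_worse)
    return 1; /* level 2: also skip on the weaker signal */
  return 0;   /* level 0: never skip */
}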
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index d7e4f4eb3..75fdf02a5 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -25,6 +25,7 @@
#include "av1/encoder/mcomp.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/temporal_filter.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -37,13 +38,12 @@ static void temporal_filter_predictors_mb_c(
MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
uint8_t *pred, struct scale_factors *scale, int x, int y,
- int can_use_previous) {
- const int which_mv = 0;
+ int can_use_previous, int num_planes) {
const MV mv = { mv_row, mv_col };
enum mv_precision mv_precision_uv;
int uv_stride;
// TODO(angiebird): change plane setting accordingly
- ConvolveParams conv_params = get_conv_params(which_mv, 0, 0, xd->bd);
+ ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
const InterpFilters interp_filters = xd->mi[0]->interp_filters;
WarpTypesAllowed warp_types;
memset(&warp_types, 0, sizeof(WarpTypesAllowed));
@@ -55,37 +55,21 @@ static void temporal_filter_predictors_mb_c(
uv_stride = stride;
mv_precision_uv = MV_PRECISION_Q3;
}
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
- 16, 16, which_mv, interp_filters,
- &warp_types, x, y, 0, MV_PRECISION_Q3, x,
- y, xd, can_use_previous);
-
- av1_highbd_build_inter_predictor(
- u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
- x, y, 1, mv_precision_uv, x, y, xd, can_use_previous);
-
- av1_highbd_build_inter_predictor(
- v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, which_mv, interp_filters, &warp_types,
- x, y, 2, mv_precision_uv, x, y, xd, can_use_previous);
- return;
- }
av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
&conv_params, interp_filters, &warp_types, x, y, 0,
0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
- av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
- &mv, scale, uv_block_width, uv_block_height,
- &conv_params, interp_filters, &warp_types, x, y, 1,
- 0, mv_precision_uv, x, y, xd, can_use_previous);
+ if (num_planes > 1) {
+ av1_build_inter_predictor(
+ u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
- av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
- &mv, scale, uv_block_width, uv_block_height,
- &conv_params, interp_filters, &warp_types, x, y, 2,
- 0, mv_precision_uv, x, y, xd, can_use_previous);
+ av1_build_inter_predictor(
+ v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ }
}
void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
@@ -214,7 +198,8 @@ void av1_highbd_temporal_filter_apply_c(
static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
- int stride) {
+ int stride, int x_pos,
+ int y_pos) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -250,11 +235,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
x->mvcost = x->mv_cost_stack;
x->nmvjointcost = x->nmv_vec_cost;
- // Use mv costing from x->mvcost directly
- av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
- cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
- &best_ref_mv1);
-
+ av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+ NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
x->mv_limits = tmp_mv_limits;
// Ignore mv costing by sending NULL pointer instead of cost array
@@ -370,7 +353,8 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(
cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
- frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+ mb_col * 16, mb_row * 16);
// Assign higher weight to matching MB if its error
// score is lower. If not applying MC default behavior
@@ -386,7 +370,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
- mb_row * 16, cm->allow_warped_motion);
+ mb_row * 16, cm->allow_warped_motion, num_planes);
// Apply the filter (YUV)
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -556,14 +540,6 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
strength = group_boost / 300;
}
- // Adjustments for second level arf in multi arf case.
- if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
- strength >>= 1;
- }
- }
-
*arnr_frames = frames;
*arnr_strength = strength;
}
@@ -593,21 +569,6 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
int which_arf = gf_group->arf_update_idx[gf_group->index];
-#if USE_GF16_MULTI_LAYER
- if (cpi->rc.baseline_gf_interval == 16) {
- // Identify the index to the current ARF.
- const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
- int arf_idx;
- for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
- if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) {
- which_arf = arf_idx;
- break;
- }
- }
- assert(arf_idx < num_arfs_in_gf);
- }
-#endif // USE_GF16_MULTI_LAYER
-
// Set the temporal filtering status for the corresponding OVERLAY frame
if (strength == 0 && frames_to_blur == 1)
cpi->is_arf_filter_off[which_arf] = 1;
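
For context on the hunks above: the error returned by temporal_filter_find_matching_mb_c decides how strongly each frame's macroblock participates in the temporal blend. In sketch form (the threshold constants here are illustrative placeholders, not the encoder's actual values):

/* Sketch: a lower motion-compensation error yields a larger blend weight.
 * THR_LOW / THR_HIGH stand in for the thresholds used in temporal_filter.c. */
#define THR_LOW 10000
#define THR_HIGH 20000
static int sketch_error_to_filter_weight(int err) {
  if (err < THR_LOW) return 2;  /* strong match: full weight */
  if (err < THR_HIGH) return 1; /* fair match: half weight */
  return 0;                     /* poor match: excluded from the blend */
}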
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
index bc0863a63..2ddc68b2c 100644
--- a/third_party/aom/av1/encoder/temporal_filter.h
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_
-#define AV1_ENCODER_TEMPORAL_FILTER_H_
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
#ifdef __cplusplus
extern "C" {
@@ -22,4 +22,4 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance);
} // extern "C"
#endif
-#endif // AV1_ENCODER_TEMPORAL_FILTER_H_
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
index de1cbe99c..63b505f36 100644
--- a/third_party/aom/av1/encoder/tokenize.h
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TOKENIZE_H_
-#define AV1_ENCODER_TOKENIZE_H_
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
#include "av1/common/entropy.h"
#include "av1/encoder/block.h"
@@ -70,4 +70,4 @@ static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
} // extern "C"
#endif
-#endif // AV1_ENCODER_TOKENIZE_H_
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
index 69063b801..405bc9e6e 100644
--- a/third_party/aom/av1/encoder/tx_prune_model_weights.h
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
-#define AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
#ifdef __cplusplus
extern "C" {
@@ -19,79 +19,114 @@ extern "C" {
#include "av1/encoder/ml.h"
// Tx type model for 4x4 block.
-static const float av1_tx_type_nn_weights_4x4_layer0[32] = {
- 0.72406f, -0.40019f, 0.51795f, -0.43881f, -0.49746f, -0.41780f, -0.39409f,
- -0.16183f, -1.00135f, -0.41733f, -0.96534f, 0.93272f, 1.06229f, 0.04188f,
- 0.60919f, 0.92405f, -0.39359f, 0.70570f, 0.75375f, 1.11966f, -1.86360f,
- -0.35421f, 0.18743f, 0.13346f, -0.21262f, 0.07050f, 0.10533f, -0.47402f,
- 1.33417f, 1.72899f, 1.17983f, 0.10552f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
};
-static const float av1_tx_type_nn_bias_4x4_layer0[8] = {
- 1.96273f, -0.69845f, -0.10999f, -1.11311f,
- 1.35101f, 0.43842f, -0.29264f, -1.15376f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
};
-static const float av1_tx_type_nn_weights_4x4_layer1[32] = {
- 0.79770f, 0.08520f, 0.23298f, 0.05285f, 0.87506f, -0.90784f, -0.06197f,
- -1.00580f, 0.68639f, -0.34881f, 0.15366f, -1.64658f, 0.80755f, -0.26293f,
- 0.10253f, -0.23915f, 1.14696f, -0.10928f, -1.61377f, 0.00863f, 0.98599f,
- -0.43872f, 0.61196f, -0.03787f, 1.01060f, 0.17643f, -0.00208f, -0.15738f,
- 0.06517f, 0.72885f, 0.24387f, 1.28535f,
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
};
-static const float av1_tx_type_nn_bias_4x4_layer1[4] = {
- 1.23769f,
- 1.40308f,
- 0.09871f,
- 1.82070f,
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
};
-static const NN_CONFIG av1_tx_type_nnconfig_4x4 = {
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
4, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
8,
}, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0,
+ av1_tx_type_nn_weights_4x4_hor_layer1 },
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- av1_tx_type_nn_weights_4x4_layer0,
- av1_tx_type_nn_weights_4x4_layer1,
- },
- {
- av1_tx_type_nn_bias_4x4_layer0,
- av1_tx_type_nn_bias_4x4_layer1,
- },
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0,
+ av1_tx_type_nn_weights_4x4_ver_layer1 },
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
};
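
Each model in this file is a small two-layer perceptron described by an NN_CONFIG: the layer-0 weight array is laid out row major as num_hidden_nodes x num_inputs (for the 4x4 models above, 8 x 4 = 32 floats) and the layer-1 array as num_outputs x num_hidden_nodes. The production evaluator is av1_nn_predict() in av1/encoder/ml.c; the standalone sketch below only illustrates the assumed layout and activation:

/* Sketch of the forward pass these tables parameterize: one hidden layer
 * with ReLU activation followed by a linear output layer. Weights are
 * assumed row major, one row per destination node. */
static void sketch_nn_forward(const float *in, int n_in, const float *w0,
                              const float *b0, int n_hid, const float *w1,
                              const float *b1, int n_out, float *out) {
  float hid[64]; /* enough for the models in this file (<= 16 hidden nodes) */
  for (int i = 0; i < n_hid; ++i) {
    float s = b0[i];
    for (int j = 0; j < n_in; ++j) s += w0[i * n_in + j] * in[j];
    hid[i] = s > 0.0f ? s : 0.0f; /* ReLU */
  }
  for (int i = 0; i < n_out; ++i) {
    float s = b1[i];
    for (int j = 0; j < n_hid; ++j) s += w1[i * n_hid + j] * hid[j];
    out[i] = s; /* raw scores; the caller thresholds or softmaxes them */
  }
}

For av1_tx_type_nnconfig_4x4_hor this would run with n_in = 4, n_hid = 8 and n_out = 4, matching the 32-entry weight arrays above.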
/******************************************************************************/
// Tx type model for 4x8 block.
static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
- 0.68355f, -0.06887f, 0.68525f, -0.86048f, -0.35906f, -0.28597f, -0.21108f,
- 0.12591f, -1.13025f, -0.65695f, -0.25658f, 0.39155f, 0.89011f, 0.19258f,
- 0.28316f, 0.61172f, 0.52587f, 0.99182f, 0.75704f, 0.66788f, -1.61814f,
- -1.23483f, -0.62868f, -0.11902f, 0.33295f, 0.64796f, 0.92345f, -0.71821f,
- 0.07575f, 0.34687f, 0.20518f, -0.87850f,
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
};
static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
- 1.14049f, -0.18583f, 1.92114f, -0.72057f,
- 1.32715f, 0.96713f, 1.09877f, -0.64345f,
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
};
static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
- 0.71978f, 0.06896f, 1.48617f, 0.97124f, -0.02487f, -0.95359f, 0.68983f,
- -0.16313f, 0.51324f, -0.33770f, 0.45938f, -1.08238f, 0.72938f, 0.42300f,
- 0.85691f, -0.03783f, 1.12617f, -0.04034f, 0.36923f, 0.25638f, 1.10167f,
- 0.41633f, 0.72602f, -0.14797f, 0.66888f, 0.11437f, -0.99797f, -0.20725f,
- 1.01163f, 2.06308f, 1.23331f, -0.15481f,
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
};
static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
- 2.14443f,
- 1.98356f,
- 0.74616f,
- 2.58795f,
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
};
static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
@@ -101,62 +136,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
{
8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_4x8_hor_layer0,
- av1_tx_type_nn_weights_4x8_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_4x8_hor_layer0,
- av1_tx_type_nn_bias_4x8_hor_layer1,
- },
+ { av1_tx_type_nn_weights_4x8_hor_layer0,
+ av1_tx_type_nn_weights_4x8_hor_layer1 },
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
};
static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
- 0.88859f, 1.02796f, 1.15509f, 0.61719f, 0.85804f, 1.17581f, 0.93524f,
- 0.06546f, 0.08018f, -0.78562f, -0.36614f, 0.14149f, -0.30069f, -0.52647f,
- -0.82789f, 0.60527f, -1.74026f, -0.20271f, 0.09875f, 0.03708f, 0.09430f,
- -0.24043f, -0.38433f, 1.21014f, 1.42443f, 0.69586f, 1.07812f, 1.21748f,
- 1.10989f, 0.93122f, 1.04127f, 0.39424f, 0.95592f, 0.12904f, 0.46330f,
- 0.49722f, 0.46303f, 0.36979f, 0.60227f, 0.39345f, -2.01632f, -0.05706f,
- 0.07766f, -0.01271f, -0.16577f, -0.21957f, -0.14800f, 0.24898f, 0.27662f,
- 0.42028f, 0.44748f, 1.14585f, 1.38805f, 0.46182f, -0.22982f, -0.07324f,
- 0.29886f, -0.46959f, -0.04228f, -0.01064f, 0.24260f, -0.32282f, -0.23804f,
- 1.44466f, -0.42190f, -0.36385f, 0.39746f, 0.38557f, -0.09624f, -0.21540f,
- 0.57385f, -0.72878f, -0.39677f, -0.00717f, 0.60499f, 1.33849f, 1.05337f,
- 1.11947f, 0.38487f, 0.86534f, -0.33970f, 0.71140f, 0.20772f, 0.61132f,
- 0.06181f, -0.20027f, 0.13736f, -0.72321f, 0.64586f, -0.56740f, -0.90912f,
- -0.20452f, 0.15381f, -0.84346f, 0.19550f, 0.63164f, 1.35441f, 0.63218f,
- 0.82883f, 0.38803f, -0.23874f, -0.02962f, 0.23846f, -0.06822f, -0.40159f,
- -0.17850f, -0.69524f, 1.12299f, -0.08286f, -0.14150f, -0.28456f, -0.41519f,
- -0.12792f, -0.55286f, 0.51655f, 0.06636f, 0.73759f, 0.70072f, 0.12616f,
- 0.31282f, 0.17130f, -1.34233f, 0.37221f, 0.95838f, 0.16286f, 1.04301f,
- 0.73600f, -0.11233f,
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
};
static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
- -0.89131f, 0.09124f, -0.71678f, -1.19929f, 0.98963f, 0.16896f,
- -0.44943f, -0.97532f, -0.13997f, 1.07136f, -0.46362f, -0.45253f,
- -0.63015f, -0.20008f, 1.24048f, -0.21265f,
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
};
static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
- -0.79795f, 0.45973f, -0.54188f, -1.05095f, 0.64404f, -0.56470f, -0.57018f,
- 0.61644f, 0.50229f, 1.14006f, 0.13805f, -0.42058f, -0.07468f, 0.66203f,
- 0.93180f, -0.59662f, -0.25152f, 0.00336f, 1.09769f, -1.11921f, 0.15151f,
- 0.58750f, -0.42480f, -0.95908f, -0.10980f, 1.31715f, 0.06665f, -0.52371f,
- 0.37228f, -0.12364f, 0.54876f, -0.32698f, 0.39863f, -0.97669f, -1.06351f,
- 1.82755f, 1.02851f, 0.10322f, -0.08322f, 0.08891f, -0.05715f, 0.93503f,
- 0.02096f, -0.39506f, -0.99330f, -0.09407f, 0.75108f, -0.30104f, 1.78314f,
- -0.01786f, -0.17392f, 0.00461f, 0.41394f, 0.92566f, 1.11251f, -0.71380f,
- -0.04907f, 0.12736f, 0.00208f, 0.94451f, -0.31783f, -0.19655f, 0.64619f,
- 0.50359f,
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
};
static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
- 0.39274f,
- 1.27276f,
- 0.30322f,
- 2.55238f,
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
};
static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
@@ -166,64 +196,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_4x8_ver_layer0,
- av1_tx_type_nn_weights_4x8_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_4x8_ver_layer0,
- av1_tx_type_nn_bias_4x8_ver_layer1,
- },
+ { av1_tx_type_nn_weights_4x8_ver_layer0,
+ av1_tx_type_nn_weights_4x8_ver_layer1 },
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x4 block.
static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
- 0.64828f, 0.61618f, 0.98975f, -0.14562f, 0.26957f, 1.80872f, 0.58299f,
- -0.06917f, 0.00937f, -0.74073f, -0.66045f, -0.04576f, -0.39802f, -0.76960f,
- -0.85166f, 0.88799f, -0.70694f, -0.34366f, -0.54906f, -0.39502f, -0.29465f,
- -0.49650f, -0.32171f, 1.37181f, 1.30432f, 0.71843f, 1.01916f, 1.01582f,
- 0.90999f, 0.86334f, 1.04603f, 0.40734f, 0.96187f, 0.53742f, 0.07510f,
- 0.44167f, 0.02049f, -0.02874f, 0.97191f, 1.03647f, -2.62751f, -0.01390f,
- -0.09282f, -0.02522f, -0.30849f, -0.19386f, -0.51356f, 0.52778f, 0.77191f,
- 0.75416f, 0.69067f, 0.93561f, 1.35982f, 0.76193f, 0.57869f, 0.00251f,
- -0.87244f, -0.26922f, -0.06682f, 0.07176f, 0.51142f, 0.58948f, 0.13914f,
- 0.71165f, -0.40329f, -0.33201f, 0.35293f, 0.33437f, -0.01812f, -0.24765f,
- 0.26810f, -0.77088f, 1.35707f, 0.22243f, 0.78402f, 0.66191f, 0.79890f,
- 1.90669f, 0.73189f, 0.24222f, -0.34682f, 0.66990f, 0.19554f, 0.58414f,
- 0.05060f, -0.21271f, 0.11656f, -0.74907f, 0.68837f, -0.39147f, -1.78263f,
- -0.69918f, -0.06838f, -0.26927f, 0.38502f, 0.08305f, 1.29848f, 0.67328f,
- 0.67269f, 0.65805f, -0.47778f, -1.02617f, 0.16523f, 0.12223f, -0.35294f,
- -0.15866f, -0.56224f, 1.25895f, -0.21422f, -0.33518f, -0.33519f, -0.37414f,
- 0.55122f, 0.14806f, 0.44312f, -0.07865f, 0.75295f, 0.10766f, 0.59922f,
- 0.48837f, -0.19099f, -2.07991f, 0.35755f, 0.87813f, 0.07559f, 1.00724f,
- 0.25223f, -0.06761f,
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
};
static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
- -0.54227f, 0.08599f, -0.77447f, -1.10920f, 0.89298f, 0.05454f,
- -0.73681f, 0.21048f, -0.41041f, 1.25690f, -0.60918f, 0.14661f,
- -0.65392f, -0.25881f, 1.67995f, -0.03550f,
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
};
static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
- -0.22312f, 0.73552f, 0.48399f, -0.66996f, 0.36527f, -0.42228f, -1.10793f,
- 0.31167f, 0.16177f, 1.69315f, -0.06287f, -0.35804f, -0.24889f, 0.80824f,
- 1.08952f, -0.62838f, 0.30066f, -0.19043f, -0.00518f, -1.31005f, 0.65797f,
- 1.07714f, -0.24253f, 0.49779f, 0.05848f, 1.08914f, 0.08015f, -0.38853f,
- 0.35108f, -0.11026f, 0.64528f, -0.37615f, 0.39995f, -0.58117f, -1.29627f,
- 1.74169f, 0.75558f, -0.04910f, 0.35020f, 0.04556f, 0.12634f, 1.27223f,
- 0.02608f, -0.19687f, -0.78649f, -0.22746f, 1.02589f, -0.28411f, 1.42443f,
- -0.42115f, -0.21153f, -0.01733f, 0.62001f, 0.87167f, 1.66008f, -0.39179f,
- -0.06293f, 0.27012f, 0.16871f, 0.64597f, 0.67358f, -0.20053f, 0.95830f,
- 0.44232f,
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
};
static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
- 0.14889f,
- 1.74197f,
- 0.53696f,
- 2.87574f,
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
@@ -233,42 +258,37 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x4_hor_layer0,
- av1_tx_type_nn_weights_8x4_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_8x4_hor_layer0,
- av1_tx_type_nn_bias_8x4_hor_layer1,
- },
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
};
static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
- 0.81919f, 0.15527f, 0.60055f, -0.54617f, -0.35510f, -0.28223f, -0.20478f,
- 0.15001f, -1.84806f, -0.30274f, -0.00865f, 0.33939f, 1.11970f, 0.44630f,
- 0.32074f, 0.39637f, 0.08149f, 1.28070f, 0.86703f, 0.76503f, -1.83991f,
- -1.13575f, -0.68605f, -0.23690f, 0.07099f, 0.64960f, 0.82543f, -0.72028f,
- 0.08220f, 0.34338f, 0.20245f, -0.88920f,
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
};
static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
- 1.14995f, -0.16021f, 2.38325f, -0.65179f,
- 1.09624f, 1.07662f, 0.63837f, -0.64847f,
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
};
static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
- 0.10278f, 0.06819f, 1.73885f, 1.29889f, -0.18482f, -1.06132f, 0.67003f,
- -0.23280f, 0.50181f, -0.33890f, 0.43524f, -1.03147f, 1.09640f, 0.66332f,
- 0.47652f, -0.02251f, 0.94245f, -0.03861f, 0.84776f, 0.28377f, 0.92044f,
- 0.23572f, 0.52082f, -0.16266f, 0.45290f, 0.11342f, -0.50310f, -0.92633f,
- 1.46345f, 1.84714f, 1.06804f, -0.13610f,
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
};
static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
- 2.41028f,
- 1.95675f,
- 0.82387f,
- 2.41923f,
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
@@ -278,131 +298,181 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
{
8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x4_ver_layer0,
- av1_tx_type_nn_weights_8x4_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_8x4_ver_layer0,
- av1_tx_type_nn_bias_8x4_ver_layer1,
- },
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x8 block.
-static const float av1_tx_type_nn_weights_8x8_layer0[128] = {
- 0.98214f, 1.05643f, 0.91173f, 0.24165f, 0.39961f, 0.25736f, 0.68593f,
- 0.10553f, 0.13353f, -0.49687f, -1.66413f, 1.16584f, 2.25147f, -0.72247f,
- -2.65486f, -0.03628f, -1.47746f, -1.07644f, -1.25551f, -0.91260f, -1.26199f,
- -1.06022f, -1.42138f, 1.10500f, 2.96552f, -0.40638f, 0.02258f, -0.23137f,
- 0.34922f, -0.01454f, 0.41251f, 0.35944f, -1.56742f, 0.01406f, 0.88114f,
- 1.42462f, 0.87243f, 0.02439f, 0.07035f, 0.34303f, -3.16843f, 0.25798f,
- 0.07494f, 0.38926f, -0.12267f, 0.09049f, -0.36711f, 0.01551f, 1.41269f,
- 1.33505f, 1.43627f, 1.41909f, 1.44605f, 1.43008f, 1.36721f, 0.19443f,
- -0.08606f, 0.17285f, 0.63692f, 0.92092f, 0.61007f, 0.87100f, -0.33631f,
- 1.98025f, -0.40686f, -0.33808f, 0.34919f, 0.33817f, -0.01807f, -0.25259f,
- 0.26442f, -0.76979f, 1.07788f, -1.38747f, 1.34315f, 2.79947f, 2.02838f,
- -0.25062f, 0.00174f, 1.25888f, 0.17344f, 0.20897f, 1.28765f, 1.95749f,
- 1.62351f, 1.04556f, 0.43858f, 0.12463f, 1.66399f, 0.03971f, 0.36614f,
- 0.56932f, 0.15982f, 0.11587f, 0.21402f, 1.89386f, -0.91267f, -0.79781f,
- 1.79155f, 0.60147f, -0.90118f, -4.32718f, -0.58154f, -0.02181f, -0.40734f,
- -0.11409f, -0.79470f, 0.69697f, -0.16588f, -0.16090f, -0.21236f, -0.52776f,
- -0.64455f, 0.09173f, 0.80766f, 0.76097f, 0.20295f, -0.93467f, -0.43509f,
- 0.59659f, 0.07788f, -3.79459f, 0.16268f, 0.47343f, 0.05106f, -0.24880f,
- 1.18941f, 0.10346f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer0[16] = {
- 0.75780f, 0.25628f, 0.19911f, -0.41384f, 1.33909f, 0.31498f,
- -1.37171f, -1.09561f, -0.44056f, 0.49001f, -0.65804f, -1.96031f,
- 0.64806f, -0.52520f, 1.38838f, 0.15519f,
-};
-
-static const float av1_tx_type_nn_weights_8x8_layer1[64] = {
- -0.63856f, -2.02670f, -0.92947f, 0.00216f, 1.47710f, -2.01099f, -2.11289f,
- -0.92288f, 0.19296f, 1.37866f, -0.85975f, -0.78624f, -2.10392f, 0.13976f,
- 1.06968f, -2.04120f, 0.57991f, -1.84941f, -0.81512f, -2.08254f, -0.47334f,
- 0.12256f, -1.39594f, -1.02829f, 0.06134f, 2.25646f, -1.25196f, -2.65317f,
- -1.94473f, 0.10989f, 0.55446f, -1.76557f, 0.33455f, -1.85556f, -3.01878f,
- -0.25100f, 1.65520f, -1.61409f, 1.16336f, -1.15560f, 0.13631f, 1.50733f,
- -1.07538f, -0.91200f, -1.93132f, 0.09271f, 0.24425f, -1.80655f, -0.01138f,
- -1.36421f, -0.62970f, -0.84766f, -0.34714f, -0.50531f, 1.91005f, -1.60316f,
- -0.02495f, 1.04938f, 0.28411f, -0.79809f, -1.48232f, 0.00766f, 0.94016f,
- -1.11974f,
-};
-
-static const float av1_tx_type_nn_bias_8x8_layer1[4] = {
- 0.53574f,
- 1.57736f,
- -0.13698f,
- 2.64613f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_8x8 = {
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
8, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
16,
}, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- av1_tx_type_nn_weights_8x8_layer0,
- av1_tx_type_nn_weights_8x8_layer1,
- },
- {
- av1_tx_type_nn_bias_8x8_layer0,
- av1_tx_type_nn_bias_8x8_layer1,
- },
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 8x16 block.
static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
- 1.36274f, 1.37313f, 1.26859f, 1.26459f, 1.37979f, 1.47217f, 1.29710f,
- 0.15765f, 0.31552f, -0.05727f, 0.25562f, 0.47925f, -0.32913f, -0.55757f,
- -0.98010f, 0.08568f, -0.62754f, 0.12834f, -0.03717f, 0.06286f, 0.26159f,
- 0.26023f, -0.62605f, 1.34500f, 1.47720f, 0.47937f, 0.84793f, 0.87866f,
- 0.81260f, 0.74761f, 0.84217f, 0.53321f, -0.78232f, 0.35321f, 0.41240f,
- 0.45002f, 0.88973f, 0.51055f, 0.91115f, -0.45512f, -2.37418f, -0.25205f,
- 0.05893f, -0.15685f, -0.25156f, -0.17104f, -0.12230f, 0.17802f, 0.18796f,
- -0.05797f, 0.26484f, 1.23515f, 1.70393f, 0.46022f, -0.14354f, 0.08501f,
- -0.84625f, -0.42578f, -0.29345f, -0.51797f, -0.56515f, -0.47465f, 0.23970f,
- 1.59912f, -0.40332f, -0.33209f, 0.37274f, 0.36831f, -0.00248f, -0.24295f,
- 0.29539f, -0.76136f, -0.22531f, 0.12371f, 0.37889f, 1.02639f, 1.73330f,
- 1.09686f, 1.04111f, 0.69006f, -1.27157f, 0.94013f, 0.61621f, 0.62274f,
- 0.48759f, 0.55672f, 0.62597f, -0.38846f, 1.72124f, 0.08214f, -0.06650f,
- 0.32617f, 0.10958f, 0.24650f, 0.10740f, 1.16861f, 0.50701f, 0.45383f,
- 0.90016f, -0.00695f, -0.11986f, -0.07834f, 0.20346f, 0.25863f, -0.40889f,
- -0.11344f, -0.79108f, 0.76259f, -0.14562f, -0.15459f, -0.20954f, -0.51306f,
- 0.02743f, -0.82456f, -0.00861f, -0.27274f, 0.28762f, 0.07282f, 0.26410f,
- 0.53413f, -0.22208f, -0.85031f, -1.39129f, -0.74519f, 0.09771f, 0.80313f,
- 1.07698f, 0.02531f,
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
};
static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
- -1.30434f, -1.19259f, -0.43467f, -0.85386f, 0.96584f, 0.29276f,
- -0.41990f, -0.96924f, -0.30933f, 0.95264f, -0.25330f, -1.19584f,
- 1.46564f, -0.42959f, 1.55720f, 0.18479f,
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
};
static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
- -1.72959f, -0.21670f, 0.10616f, -0.02006f, 0.15084f, -0.85303f, -0.27535f,
- 0.58704f, 0.23683f, 1.19743f, 0.77971f, 0.49874f, 0.19508f, 0.19641f,
- 1.47895f, -0.52173f, -0.56746f, -0.50761f, 0.15864f, -0.95168f, 0.48103f,
- 0.91904f, -0.11700f, 0.62863f, 0.06526f, 1.63803f, -0.72325f, -1.80449f,
- 0.66373f, 0.12831f, 0.27139f, -0.26346f, 1.50852f, 0.25079f, -0.54255f,
- 1.78815f, 1.39691f, -0.44989f, -0.18511f, -1.52903f, 0.13983f, 1.06906f,
- -0.30184f, 0.37566f, 0.46209f, 0.10440f, 0.64695f, -0.34002f, 1.96990f,
- 0.21189f, -0.91248f, -0.11263f, 0.26708f, 1.27405f, 1.89776f, 0.02081f,
- -0.06977f, -0.02584f, 0.47733f, 0.27117f, 1.33315f, -0.09175f, 0.48747f,
- 1.16772f,
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
};
static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
- 1.25783f,
- 1.19452f,
- 0.69964f,
- 2.41982f,
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
@@ -412,62 +482,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x16_hor_layer0,
- av1_tx_type_nn_weights_8x16_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_8x16_hor_layer0,
- av1_tx_type_nn_bias_8x16_hor_layer1,
- },
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
};
static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
- 0.90888f, 0.86305f, 0.81674f, 0.75352f, 1.07834f, 0.99048f, 0.96355f,
- 0.13836f, -0.51334f, 0.19906f, 1.84608f, 0.67828f, 0.45876f, 0.08325f,
- 0.28190f, -0.01958f, -1.96553f, 0.27837f, -0.05929f, 0.13491f, 0.21036f,
- 0.05797f, -0.01373f, 0.73765f, 1.39603f, -0.53767f, 0.10362f, 0.03420f,
- 0.41909f, 0.09510f, 0.32284f, 0.83860f, 0.13954f, 0.48434f, 1.47762f,
- 0.45891f, 0.23613f, 0.13013f, 0.82097f, -0.03251f, -1.89757f, 0.21589f,
- -0.10370f, 0.02530f, -0.25659f, 0.01466f, -0.23661f, 0.22783f, 0.92100f,
- 1.02915f, 1.20358f, 1.17251f, 0.97749f, 1.04696f, 0.91333f, 0.54576f,
- -0.52792f, 0.02217f, 0.25652f, 0.31405f, -0.18398f, 0.04572f, -0.81359f,
- 1.82883f, -0.40047f, -0.33056f, 0.35255f, 0.34448f, -0.00339f, -0.23857f,
- 0.28925f, -0.77175f, -0.24325f, -0.21420f, 1.11451f, 1.39553f, 0.51573f,
- 0.05476f, 1.13791f, 0.94959f, -0.35710f, 0.67467f, 0.16722f, 0.61213f,
- 0.07683f, -0.20613f, 0.13440f, -0.72131f, -0.15418f, -0.17688f, -0.16510f,
- -0.19226f, 0.09270f, -2.43559f, -0.12669f, 0.05074f, 0.30414f, 0.00927f,
- 0.60630f, 0.00801f, -1.07310f, -0.06227f, 2.10607f, 0.02382f, -0.39891f,
- -0.09149f, -0.78596f, 0.83966f, -0.14802f, -0.14083f, -0.20831f, -0.55136f,
- 0.08566f, -0.00647f, 0.07044f, 0.53408f, 0.85720f, -0.07393f, 0.24476f,
- 0.43767f, 0.30519f, -1.89430f, 0.23252f, 1.63790f, 0.17316f, -0.03903f,
- 0.25269f, 0.01562f,
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
};
static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
- -0.83370f, -0.20704f, -0.60437f, -0.81664f, 1.16998f, 0.16745f,
- -1.34680f, -1.07083f, -0.34649f, 0.65598f, -0.56278f, 0.22660f,
- -0.25956f, -0.29608f, 1.24359f, -0.09167f,
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
};
static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
- -0.71147f, -0.63964f, -0.69220f, 0.22326f, 0.67191f, -0.58894f, -0.98464f,
- 0.23583f, 0.22824f, 1.39838f, 0.09920f, -0.59411f, -0.67101f, 0.19088f,
- 0.83025f, -0.66991f, -0.42889f, -0.49969f, 1.39532f, -1.02000f, 0.62101f,
- 0.57175f, -0.83226f, 0.01551f, 0.05604f, 1.23028f, 0.02030f, -0.55995f,
- -0.42349f, 0.15375f, 0.52132f, -0.52421f, 0.89586f, -0.73778f, -0.10911f,
- 0.22447f, 1.16858f, -0.48169f, 1.73890f, -0.69860f, 0.12504f, 1.10492f,
- 0.04391f, -0.85670f, -0.49257f, 0.09616f, 0.76518f, -0.44854f, 1.50938f,
- 0.62246f, -0.40366f, -0.11182f, -0.01680f, 0.59724f, 1.32170f, -1.09061f,
- -0.04278f, -0.02449f, 0.25024f, 1.26239f, 0.42345f, -0.10031f, 0.80871f,
- 0.44198f,
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
};
static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
- 0.68329f,
- 1.33555f,
- 0.25943f,
- 3.23439f,
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
};
static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
@@ -477,64 +542,59 @@ static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_8x16_ver_layer0,
- av1_tx_type_nn_weights_8x16_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_8x16_ver_layer0,
- av1_tx_type_nn_bias_8x16_ver_layer1,
- },
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
};
/******************************************************************************/
// Tx type model for 16x8 block.
static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
- 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f,
- 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
- -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f,
- 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f,
- 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f,
- 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f,
- 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f,
- 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f,
- -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f,
- 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f,
- 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f,
- -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f,
- 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f,
- 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f,
- 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f,
- -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f,
- 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f,
- 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f,
- 0.43289f, -0.00362f,
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
};
static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
- -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f,
- -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f,
- -0.14741f, -0.43954f, 1.72137f, -0.21704f,
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
};
static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
- -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f,
- 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f,
- 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f,
- 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f,
- -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f,
- 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f,
- -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f,
- 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f,
- -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f,
- 0.63304f,
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
};
static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
- 0.71765f,
- 1.40400f,
- 0.32221f,
- 3.07234f,
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
};
static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
@@ -544,62 +604,57 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x8_hor_layer0,
- av1_tx_type_nn_weights_16x8_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_16x8_hor_layer0,
- av1_tx_type_nn_bias_16x8_hor_layer1,
- },
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
};
static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
- 1.20497f, 1.23691f, 1.23738f, 1.07773f, 1.15264f, 1.31959f, 1.15365f,
- 0.17179f, 0.68612f, 0.55636f, 0.57145f, 0.67022f, 0.19636f, -1.27420f,
- -1.36428f, -0.16706f, -1.20934f, -0.87794f, -0.97146f, -0.74722f, -1.14493f,
- -1.02689f, -0.88153f, 0.83857f, 1.53355f, 0.13601f, 0.35451f, 0.53750f,
- 0.62381f, 0.32438f, 0.59405f, 0.33090f, -1.52948f, -0.46094f, 0.42634f,
- 0.48763f, 0.30707f, 0.52553f, 0.71427f, -0.31287f, -2.37106f, -0.18756f,
- 0.16561f, -0.00431f, -0.13747f, -0.09336f, -0.16511f, 0.13454f, 0.45010f,
- -0.00317f, -0.06403f, 0.95442f, 1.59636f, 0.30602f, -0.05515f, 0.05467f,
- -0.21758f, -0.19192f, -0.17935f, -0.00545f, 0.35409f, 0.26141f, -0.32174f,
- 1.78129f, -0.40161f, -0.33158f, 0.38084f, 0.38081f, 0.01053f, -0.23567f,
- 0.29239f, -0.76159f, -0.19373f, 0.13649f, 0.66949f, 1.19733f, 1.92557f,
- 1.16691f, 0.94955f, 0.62324f, -0.85434f, -0.07699f, 0.87683f, 0.95911f,
- 0.86106f, 0.57959f, 0.40146f, -0.35851f, 1.55427f, 0.15349f, -0.01582f,
- 0.32517f, 0.03784f, 0.15916f, 0.09024f, 1.43187f, 0.56160f, 0.11521f,
- 0.52476f, -0.26107f, -0.38167f, -0.31596f, 0.31304f, -0.65366f, -0.40680f,
- -0.11082f, -0.78585f, 0.77906f, -0.13322f, -0.13747f, -0.21001f, -0.53204f,
- -0.06752f, -0.84741f, -0.53442f, -0.16284f, 0.54027f, 0.13586f, -0.42001f,
- 0.85388f, 0.08300f, -0.89325f, -1.73681f, -0.70473f, 0.23151f, 0.69549f,
- 0.72124f, 0.12769f,
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
};
static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
- -1.15644f, -0.31062f, 0.20697f, -0.60304f, -1.19498f, 0.21451f,
- -0.42825f, -0.71800f, -0.25816f, 1.47408f, -0.24423f, -1.45773f,
- -0.55834f, -0.36938f, 1.56759f, 0.07238f,
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
};
static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
- -1.45227f, -0.67141f, 0.75237f, 0.32681f, -0.70528f, -0.76730f, -0.49777f,
- 0.02418f, 0.25096f, 1.14840f, 0.23548f, 0.48755f, 0.33164f, 0.21050f,
- 1.41651f, -0.28888f, -0.76668f, 0.04439f, 0.67538f, -1.06438f, 0.68128f,
- 0.95824f, 0.08530f, -0.03635f, 0.06820f, 1.38621f, -0.50424f, -1.72992f,
- -0.20949f, 0.13400f, 0.93366f, -0.05324f, 1.41593f, -0.75119f, -1.80912f,
- 1.05440f, 0.62580f, -0.30867f, -0.07025f, -0.34654f, 0.13621f, 1.74426f,
- -0.22417f, 0.47031f, -0.08142f, 0.10151f, 0.42498f, 0.06635f, 1.50623f,
- 1.04130f, 0.85107f, 0.23382f, 0.69800f, 1.10856f, 1.18767f, -0.69395f,
- -0.07985f, 0.50412f, 0.46019f, 0.49214f, 0.44219f, -0.09502f, 0.75745f,
- 0.99208f,
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
};
static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
- 0.68774f,
- 0.88572f,
- 0.77462f,
- 3.05667f,
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
};
static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
@@ -609,14 +664,9 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x8_ver_layer0,
- av1_tx_type_nn_weights_16x8_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_16x8_ver_layer0,
- av1_tx_type_nn_bias_16x8_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
};
/******************************************************************************/
@@ -687,445 +737,253 @@ static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
};
/******************************************************************************/
-// Tx type model for 16x32 block.
-static const float av1_tx_type_nn_weights_16x32_hor_layer0[128] = {
- 0.89821f, 0.90804f, 1.13052f, 0.74855f, 1.02053f, 0.91260f, 0.97102f,
- 0.16808f, -0.19982f, -0.33296f, -0.22490f, -0.22481f, -0.09332f, -2.44338f,
- -0.12236f, -0.03158f, -1.43561f, 0.07794f, 0.16586f, 0.09731f, 0.12967f,
- 0.09725f, -0.16826f, 1.26640f, 0.88004f, 0.27312f, -0.07993f, 0.33640f,
- 0.11732f, 0.33384f, 0.97066f, -0.61744f, -0.48545f, 0.44622f, 0.73744f,
- 0.32262f, -0.05713f, 0.42280f, 1.10378f, 0.18540f, -2.07906f, 0.11443f,
- 0.37877f, 0.24136f, -0.12524f, -0.12434f, 0.02116f, 0.11716f, 1.28267f,
- 1.01508f, 1.26184f, 1.22545f, 1.29582f, 1.18855f, 1.27564f, 0.42001f,
- -0.41481f, 0.06725f, -0.13133f, -0.24801f, 0.16515f, 0.16228f, 0.35197f,
- 0.53610f, -0.39805f, -0.32584f, 0.40096f, 0.38621f, -0.00030f, -0.23434f,
- 0.29149f, -0.76542f, 0.04996f, -0.30036f, 1.48687f, 0.90852f, -0.03083f,
- -0.15953f, 1.19259f, 0.87690f, -1.08977f, 0.78757f, 0.81149f, 0.54089f,
- 0.35400f, 0.37919f, 0.84997f, -0.20449f, 0.39601f, -0.37596f, 0.64748f,
- 0.26021f, 0.37354f, 0.23593f, 0.16335f, 1.70681f, 0.31800f, -0.00964f,
- 0.82687f, -0.78372f, -1.47438f, 0.32410f, 1.37436f, 0.07476f, -0.40574f,
- -0.10353f, -0.79300f, 0.74381f, -0.15601f, -0.14380f, -0.20961f, -0.52697f,
- 0.04669f, -0.00870f, 0.05624f, -0.09036f, 0.25701f, 0.30336f, 0.24199f,
- 0.45579f, 0.66330f, -1.81834f, 0.74965f, 1.22747f, 0.25072f, 0.25100f,
- 0.43289f, -0.00362f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer0[16] = {
- -0.87643f, 0.36754f, -0.86409f, 1.37761f, 1.22688f, 0.09074f,
- -1.47139f, -1.06100f, -0.24087f, 1.10382f, -0.32837f, -1.39592f,
- -0.14741f, -0.43954f, 1.72137f, -0.21704f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_hor_layer1[64] = {
- -0.81860f, -0.80745f, -0.43612f, 0.58656f, 0.37455f, -0.56519f, -1.71536f,
- 0.23278f, 0.23951f, 1.09610f, 0.49986f, 0.43375f, -0.53182f, 0.17376f,
- 1.05626f, -0.61743f, -0.71777f, -0.66943f, 1.40091f, 0.34426f, 1.14295f,
- 0.45571f, -0.52504f, -0.00303f, 0.06044f, 0.66119f, -0.60340f, -1.14344f,
- -0.28045f, 0.12742f, 0.61484f, -0.41016f, 1.36102f, -0.86969f, -0.52728f,
- 1.01725f, 0.67083f, -0.10138f, 1.36406f, 0.34066f, 0.12498f, 0.86595f,
- -0.39636f, -0.27888f, -0.40244f, 0.09847f, 0.81178f, -0.45313f, 1.39127f,
- 0.99865f, -0.57908f, 0.55072f, 0.49638f, 1.11524f, 1.85504f, -0.28316f,
- -0.05195f, -0.23284f, 0.26461f, -1.28120f, 0.60707f, -0.06110f, 0.74085f,
- 0.63304f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_hor_layer1[4] = {
- 0.71765f,
- 1.40400f,
- 0.32221f,
- 3.07234f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_hor = {
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
8, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x32_hor_layer0,
- av1_tx_type_nn_weights_16x32_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_16x32_hor_layer0,
- av1_tx_type_nn_bias_16x32_hor_layer1,
- },
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
};
+/******************************************************************************/
-static const float av1_tx_type_nn_weights_16x32_ver_layer0[512] = {
- -0.01219f, 0.51494f, 0.25450f, 0.45788f, -0.87277f, 0.32954f, -0.04851f,
- -0.24321f, -0.40000f, 0.21915f, 0.14108f, 0.98268f, 0.18989f, 0.54298f,
- 0.36349f, 0.38931f, 1.08124f, 0.87199f, 1.03553f, 1.14777f, 1.04254f,
- 1.11336f, 0.92198f, 0.84715f, 1.89363f, 1.21587f, 0.72377f, 1.25097f,
- 0.84231f, 0.95529f, 1.12346f, 0.19113f, -0.04559f, 0.56859f, 0.59747f,
- 0.60176f, 0.82465f, 0.59009f, 0.67240f, 1.58674f, -0.92951f, -0.23449f,
- 0.11923f, -0.19151f, -0.15914f, 0.03146f, -0.16541f, 0.17181f, -0.21834f,
- 0.21906f, 0.96708f, 0.36085f, -0.42380f, -2.25681f, -0.48812f, 0.72875f,
- 0.06585f, 0.18818f, -0.02109f, -0.10996f, 0.00187f, -0.02078f, 0.04484f,
- -0.07171f, 0.94773f, -0.33466f, 0.28484f, 0.14791f, 0.30274f, 0.13377f,
- 0.40970f, 0.45133f, 1.69265f, -0.36422f, -0.15889f, 0.07670f, 0.44675f,
- -0.28665f, -0.07097f, 1.03803f, -0.83274f, -0.24571f, 0.08039f, -0.23790f,
- -0.23276f, -0.28031f, 0.26451f, -0.18513f, -2.23336f, -0.62073f, 0.32495f,
- -0.67644f, -0.08559f, -0.36382f, -0.24515f, -0.01899f, 0.09012f, 0.19723f,
- 0.04017f, 0.31624f, 0.58369f, 0.30411f, -0.81165f, -2.58541f, -0.20491f,
- 0.68089f, -0.14799f, 0.13925f, 0.12867f, 0.15229f, 0.06887f, -0.03784f,
- 0.02288f, -0.28712f, 0.14107f, 0.29485f, -0.11662f, 0.25239f, 0.30311f,
- -0.07377f, -0.10962f, 0.59856f, 0.47967f, 0.01847f, -0.27889f, 0.46786f,
- 0.18118f, 0.09355f, -2.10076f, 0.38823f, 0.28202f, 0.29104f, 0.86977f,
- 0.52377f, 0.21161f, 0.72888f, -0.00952f, 0.15982f, -0.14651f, 0.28763f,
- -0.14155f, 0.00093f, 0.08351f, 0.34685f, -0.22066f, 0.20378f, 0.25416f,
- 0.03423f, -0.11068f, -0.41612f, 0.56913f, -0.06697f, -0.12585f, -0.21033f,
- -0.14513f, -0.04477f, -0.35778f, 0.03437f, 0.06956f, -0.25356f, -1.46010f,
- -0.08142f, 0.11926f, -0.63551f, -0.13882f, 0.34164f, 0.10821f, 1.07323f,
- -0.62435f, -0.27116f, 0.25971f, 0.11952f, -0.39480f, -0.05474f, -0.12582f,
- 0.28289f, 0.13723f, 0.58369f, 0.41865f, 0.28574f, 1.01357f, 0.46661f,
- 0.61717f, 0.85708f, -0.03930f, -0.38013f, -0.33888f, -0.20561f, -0.19087f,
- -0.01041f, 0.12119f, -0.20786f, 0.55915f, 0.67511f, 0.55554f, 0.56540f,
- 0.76647f, 0.54766f, 0.45166f, 0.61384f, 0.95407f, -0.06811f, -0.62132f,
- 0.12713f, 0.63713f, 2.04090f, 1.17054f, 0.00469f, -0.93692f, -0.24136f,
- -0.04281f, -0.15787f, 0.37956f, -0.09174f, -0.72494f, 0.55285f, -1.40996f,
- -0.54077f, 0.38445f, -0.08258f, 0.64259f, -0.54058f, -0.49865f, 1.41371f,
- 0.89014f, 0.78788f, 0.37919f, 0.87447f, -0.00760f, -0.00947f, 0.16323f,
- -0.36632f, -1.38115f, -0.24619f, 0.40490f, -0.08871f, -0.25365f, -0.60842f,
- 0.11128f, 0.18658f, -0.86001f, -0.28271f, 0.39572f, -0.29930f, -0.10110f,
- 0.33706f, 0.21731f, 0.15383f, -0.01707f, 0.02812f, 0.31192f, 0.39742f,
- 0.38260f, -0.48263f, 0.57385f, 0.53239f, -0.60013f, -0.63211f, -0.45140f,
- -0.73520f, -0.95260f, -0.70633f, -0.96190f, 0.01747f, -0.05195f, -0.07138f,
- -1.09535f, -0.63548f, -1.55700f, -0.35721f, -0.18923f, 0.77568f, 0.09419f,
- 0.36919f, -0.32761f, -0.06597f, -0.38988f, -0.43674f, -0.24284f, 0.36906f,
- 0.28414f, 0.19273f, -0.68516f, 0.09514f, -0.45381f, 0.19917f, -0.32377f,
- 1.32549f, 0.08244f, -0.64405f, 0.13195f, 2.85307f, 0.47631f, -0.33408f,
- 0.04168f, 0.18585f, -0.18029f, 0.07986f, -0.08816f, -0.00703f, -0.01515f,
- -0.13164f, 0.00571f, 0.05676f, 1.51425f, 0.73360f, 0.43486f, -0.08223f,
- -0.06183f, -0.57098f, -0.29948f, 0.05945f, 0.19238f, -0.47980f, -0.35902f,
- -0.19931f, 0.43443f, 0.67436f, 0.78573f, 0.25703f, 1.01863f, 0.99047f,
- 0.95228f, 1.02429f, 1.19264f, 0.29935f, -0.26583f, -0.98749f, -0.46167f,
- -0.29727f, -0.10515f, -0.39790f, -0.59321f, -0.61925f, -0.95452f, 0.04292f,
- -0.48273f, -0.91195f, -0.45971f, -0.46355f, -0.88319f, -0.51712f, -0.47682f,
- -0.86110f, -0.59178f, -0.57163f, -0.94946f, 0.19627f, -0.18699f, 0.11037f,
- 1.39110f, 0.05715f, 3.00762f, 1.52243f, 0.25028f, 0.12779f, -0.12871f,
- 0.04764f, 0.08288f, -0.16572f, -0.06580f, 0.05845f, -0.01474f, 0.04886f,
- -0.10000f, 0.12911f, -0.01416f, -0.12472f, 0.14358f, 0.16554f, 0.08853f,
- 0.13418f, -0.05408f, -0.13871f, -0.00049f, 0.20725f, -0.05603f, 0.27885f,
- -0.14277f, 0.29653f, -0.24739f, 0.10101f, -0.17068f, -2.43802f, 0.41834f,
- 0.49784f, 0.34949f, 0.98487f, 0.16792f, 1.07355f, 0.32546f, 1.32377f,
- -0.08584f, 0.85214f, -0.05721f, 0.90307f, 0.20167f, 0.52664f, -0.14478f,
- 0.64997f, 0.06846f, 0.32475f, 0.64453f, 0.70143f, -0.03091f, -0.24958f,
- -0.39021f, -0.57693f, -0.18319f, 0.11793f, -0.05948f, 0.36670f, -0.27932f,
- 0.14800f, -0.55459f, -0.89673f, 0.65922f, 0.54308f, -0.16731f, -0.59731f,
- -0.20705f, -0.18183f, -0.05645f, -0.06829f, -0.40210f, -0.27955f, 0.28054f,
- 0.57665f, 0.14171f, 0.54693f, -0.22144f, -0.59664f, 0.13295f, 0.07057f,
- -0.19698f, 0.03328f, -0.09687f, -0.32390f, -0.11506f, -0.40406f, -0.11473f,
- 0.10399f, -0.29824f, 0.16028f, 0.00053f, 0.22699f, 0.04203f, -0.43880f,
- -0.12654f, 0.12172f, 0.21087f, -0.46350f, -0.22081f, -0.06173f, -0.23287f,
- 0.90314f, 0.04466f, -0.06149f, 0.32682f, 0.16609f, -0.58991f, -0.03786f,
- -0.41329f, 0.02632f, 0.23411f, 0.25344f, 0.16468f, 0.31007f, 0.21845f,
- 0.32462f, 0.33945f, 0.11527f, -0.35926f, -0.18584f, 0.29340f, 0.78199f,
- 2.39287f, 0.53838f, -1.55085f, 0.02238f, -0.26153f, -0.42498f, -0.02460f,
- 0.19261f, -0.10870f, -0.08453f, -0.39561f, 0.08600f, 0.36310f, 0.58439f,
- -0.59526f, 0.13104f, -0.06703f, -0.17529f, -0.41431f, -0.23121f, -0.32394f,
- -0.33324f, -0.21405f, -0.41702f, -0.29236f, -0.31766f, -0.33512f, -0.22679f,
- -0.13680f, -0.00118f, -1.81744f, -2.34798f, -1.08048f, -0.29883f, -0.29123f,
- -0.01752f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer0[32] = {
- 1.02458f, -1.02185f, -0.18978f, 0.05981f, -0.94931f, 0.34544f, 0.04415f,
- -0.60036f, -0.11368f, -0.14154f, 1.23438f, 0.51640f, -0.57587f, -0.91380f,
- 0.95720f, 0.68298f, -0.06353f, -2.14960f, -0.11080f, 0.79380f, -0.94199f,
- 0.43040f, 0.01358f, 0.07201f, -0.49689f, -0.14839f, -0.80132f, -0.13925f,
- -0.11834f, -0.24998f, -0.33976f, 0.35497f,
-};
-
-static const float av1_tx_type_nn_weights_16x32_ver_layer1[128] = {
- 0.87367f, -1.06469f, -0.50829f, -0.70540f, 1.14596f, -1.12346f, -0.94467f,
- 0.01380f, -0.18911f, 0.07961f, -0.18626f, 0.61902f, -0.64423f, 1.21545f,
- 1.01149f, 0.26309f, 1.50380f, 1.93940f, -0.64064f, 1.03987f, -1.88000f,
- -0.44574f, -1.53303f, 1.36307f, 1.00292f, 0.37031f, 0.21594f, 0.16758f,
- 0.02592f, -0.77431f, -0.31797f, -1.53826f, 1.14013f, -1.21957f, 0.04571f,
- -0.22168f, 0.32299f, 0.25949f, -0.13306f, 0.17850f, 0.92494f, 0.19999f,
- 0.07494f, -0.03362f, -0.53453f, 1.02970f, -0.22947f, 0.73964f, 1.08445f,
- 0.16855f, -0.02686f, 0.25254f, 0.05952f, 0.02194f, 0.05649f, 0.39195f,
- 0.14139f, 0.53843f, -0.06959f, -0.06993f, -0.14151f, -0.53147f, 0.17481f,
- -1.21977f, 0.62932f, 1.07173f, 0.24049f, -0.51574f, 0.97492f, -0.28169f,
- -0.15406f, -0.05441f, -0.25415f, 0.16583f, 0.43674f, -0.00593f, -0.09277f,
- 0.61402f, 1.35562f, -0.03926f, 0.18967f, -0.29548f, -0.55509f, 0.23661f,
- 0.05023f, 0.36226f, -0.83314f, 0.39357f, 0.19943f, -0.63431f, -0.03847f,
- 0.12213f, 0.62024f, -0.11704f, -0.22483f, 0.96624f, 0.18518f, 0.09181f,
- -0.63068f, 0.66797f, 0.74107f, 0.40624f, 0.70636f, -0.06921f, 0.34175f,
- -0.15513f, 2.07844f, 0.22126f, 0.52919f, 0.26793f, -0.50018f, 1.10549f,
- 0.10970f, 0.05831f, 0.82842f, -1.22975f, 1.78377f, 0.92679f, 2.01480f,
- -1.19011f, -0.53381f, 0.38533f, 0.45579f, -0.10683f, -0.40828f, 0.31398f,
- 0.14978f, 0.91325f,
-};
-
-static const float av1_tx_type_nn_bias_16x32_ver_layer1[4] = {
- 1.03659f,
- 1.80249f,
- 1.25710f,
- 1.32000f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_16x32_ver = {
- 16, // num_inputs
- 4, // num_outputs
- 1, // num_hidden_layers
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
{
- 32,
+ 16,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_16x32_ver_layer0,
- av1_tx_type_nn_weights_16x32_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_16x32_ver_layer0,
- av1_tx_type_nn_bias_16x32_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
};
-/******************************************************************************/
-// Tx type model for 32x16 block.
-static const float av1_tx_type_nn_weights_32x16_hor_layer0[512] = {
- -0.07289f, 0.30798f, 0.41881f, 0.33434f, -0.01599f, 0.85307f, -0.16060f,
- -0.07922f, -0.04693f, 0.29186f, 0.44117f, 1.02417f, 0.12447f, 0.46321f,
- 0.40060f, 0.50140f, 0.48338f, 0.47298f, 0.36585f, 0.42821f, 0.41289f,
- 0.47534f, 0.42900f, 0.26061f, 0.45887f, 0.38163f, 0.17302f, 1.00888f,
- 1.79910f, 1.36140f, 0.24471f, 0.04557f, 1.10823f, 0.74325f, 0.91210f,
- 0.81387f, 0.98865f, -0.09874f, 0.55146f, 0.19385f, -0.50752f, -0.17249f,
- 0.27261f, -0.02763f, -0.03286f, 0.09122f, 0.07015f, 0.20012f, 0.68983f,
- -1.25345f, -0.00145f, 0.71567f, 0.54948f, -0.56154f, -0.28918f, 0.11997f,
- -0.09907f, 0.09195f, 0.05768f, 0.15558f, 0.11284f, -0.35195f, -0.08723f,
- -0.03571f, 0.94031f, 0.63737f, 0.98202f, 0.93826f, 0.87126f, 0.88530f,
- 0.97697f, 0.55283f, 0.58670f, 0.86502f, 0.97008f, 0.99709f, 0.66214f,
- 0.96660f, 0.99890f, 0.31945f, -1.00301f, 0.13215f, -0.03950f, 0.21148f,
- 0.05128f, 0.10955f, 0.44839f, -0.33438f, -2.09773f, 0.13908f, 0.58669f,
- 0.25268f, -0.24006f, 0.01286f, -0.05732f, 0.03401f, -0.06896f, 0.35397f,
- 0.05133f, -0.21449f, -0.38437f, -0.32326f, -0.38731f, -0.44419f, 0.25968f,
- -0.29422f, -0.12553f, -0.08896f, -0.16400f, -0.22309f, 0.21380f, -0.26912f,
- 0.06866f, -0.25694f, 0.17632f, 0.32032f, -0.10666f, 0.26278f, 0.31877f,
- -0.09338f, -0.14289f, 0.54232f, 0.46070f, 0.00059f, -0.27914f, 0.45177f,
- 0.16274f, -0.08811f, -0.45791f, 0.53946f, -0.16794f, 0.16229f, 0.11840f,
- -0.24435f, 0.26894f, -0.33180f, -0.47314f, 0.34061f, -0.13939f, 0.13321f,
- -0.05208f, -0.18139f, -0.35234f, 1.37298f, -0.19360f, 0.21728f, 0.26088f,
- 0.04045f, -0.10763f, -0.40470f, 0.50026f, -0.06726f, -0.12871f, -0.20963f,
- -0.14583f, -0.04711f, -0.35988f, 0.03091f, 0.06491f, -0.31668f, -0.52190f,
- 0.23397f, -0.13984f, -0.15207f, -0.49977f, 0.51205f, 0.12559f, -0.03631f,
- 0.33447f, -0.36684f, 0.17533f, 0.15671f, -0.00096f, 0.06817f, 0.20922f,
- 0.34006f, 0.71260f, 0.45024f, 0.53033f, 0.15645f, 0.76019f, 0.56870f,
- 0.83066f, 0.63022f, 1.74436f, -0.24798f, 0.06795f, -0.00749f, 0.17795f,
- 0.10371f, 0.06527f, 0.41054f, 0.49003f, 0.34630f, 0.02615f, 0.30320f,
- -0.47133f, -0.49584f, 0.21775f, 0.27530f, -0.29977f, -0.64269f, 0.52627f,
- -0.02492f, 0.08077f, 0.40786f, -0.36015f, -0.70714f, -1.98185f, -0.28187f,
- 0.35018f, -0.06105f, -0.12710f, 0.06606f, -0.27805f, 0.44630f, -0.84731f,
- -0.26699f, 0.25856f, 0.06194f, -0.18674f, -0.11560f, -0.43277f, 1.10579f,
- 0.95876f, 0.17415f, 0.56386f, 0.68426f, 0.50180f, 0.24844f, 0.12347f,
- 0.15281f, -0.19089f, 0.52279f, 0.41860f, -0.05270f, -0.17029f, -0.03542f,
- 0.10621f, -0.25088f, 0.24070f, -0.08951f, 0.29950f, -0.36720f, 0.02151f,
- 0.20129f, -0.70066f, -0.23144f, -0.20070f, -0.39262f, -0.01597f, -0.05591f,
- 0.23814f, -0.25991f, 0.05812f, 0.60554f, -0.06106f, -0.58326f, 0.28762f,
- -0.18747f, 0.08232f, -0.04243f, -0.03293f, 0.14722f, -0.13017f, -0.67263f,
- 0.38698f, -0.18207f, -0.11496f, -0.27976f, -0.55345f, 1.42872f, 0.04684f,
- 0.04214f, 0.00030f, 0.02410f, 0.19966f, -0.04246f, 0.00442f, 0.23121f,
- 0.13364f, 0.21548f, -0.12748f, -0.14066f, -0.28354f, 0.59937f, -0.27553f,
- 1.57503f, -0.01050f, -0.17724f, 0.44110f, -0.80334f, 0.72064f, 1.00501f,
- -0.72638f, 0.02774f, 0.48540f, -0.72016f, -0.27721f, 0.31559f, 0.07322f,
- 0.20279f, -0.19647f, 0.02352f, 0.12662f, 0.19743f, 0.30543f, 0.25712f,
- 0.44702f, 0.16417f, 0.17888f, -2.58469f, 0.20555f, 0.57782f, -0.10892f,
- 0.14527f, 0.82251f, 0.04200f, 0.44626f, 0.10818f, 0.71204f, 0.62903f,
- 0.69178f, 0.73603f, 0.52717f, 0.83020f, 0.48824f, 1.03270f, -0.00152f,
- 0.07958f, 0.24181f, -0.78839f, -0.74214f, -0.72998f, -1.58694f, 0.17735f,
- 0.56318f, 0.32580f, -0.58503f, -0.33673f, -0.00838f, 0.48924f, 0.43362f,
- 0.12750f, 0.00295f, 0.38624f, 0.17037f, 0.00729f, -0.26256f, -0.41669f,
- 0.36847f, 0.22424f, 1.33334f, 0.18112f, 0.37682f, 0.49173f, -0.45240f,
- -0.04857f, -0.35038f, -0.83099f, -0.01988f, 0.03497f, 0.38033f, 0.13685f,
- 0.17597f, 0.28668f, 0.31193f, -0.43281f, 0.43267f, -0.50495f, 0.01969f,
- 0.14131f, -0.09326f, -0.39425f, -0.62048f, -0.09119f, -0.28306f, -0.52671f,
- -0.38584f, -0.10953f, 0.19669f, 0.34540f, -0.49941f, 0.04605f, -0.43535f,
- 0.27519f, 0.03659f, -0.31961f, 0.13330f, 0.87009f, 0.20101f, -0.70392f,
- -0.27883f, 0.33874f, -0.34308f, 0.67760f, 0.88195f, 0.55752f, -0.26563f,
- 0.17875f, 0.06964f, 0.87607f, 1.47616f, 0.46747f, -0.56408f, -0.39352f,
- -0.16427f, -0.41185f, 0.14187f, 0.19265f, -0.58613f, 0.56345f, -0.17729f,
- -0.11320f, 0.08752f, -0.01329f, 1.20981f, 0.45170f, -0.20571f, -0.01150f,
- 0.26476f, 0.13508f, 0.22020f, -0.42684f, -0.22499f, -1.51212f, 0.86648f,
- 0.21776f, 0.24666f, 0.71339f, 0.42742f, -0.00952f, 0.14762f, 0.07693f,
- -0.19599f, 0.03075f, -0.09703f, -0.32483f, -0.11616f, -0.40461f, -0.11693f,
- 0.10038f, -0.30038f, 0.14686f, 0.00548f, 0.20350f, 0.00763f, -0.43756f,
- -0.01997f, 0.00902f, 0.07470f, -0.41441f, -0.20605f, 0.07626f, -0.34973f,
- 0.47455f, -0.15251f, -0.05325f, 0.04964f, 0.32477f, -0.54604f, 0.25273f,
- -0.18461f, -0.30841f, 0.64908f, 0.60752f, 0.64148f, 0.72788f, 0.71232f,
- 0.58597f, 0.73017f, 0.58857f, 0.71908f, 0.59860f, 0.61849f, 0.99398f,
- 0.39572f, -0.36165f, -1.88646f, 0.14384f, -0.60541f, -0.21380f, -0.55498f,
- -0.50960f, -0.08801f, 0.51892f, 0.19126f, 0.57879f, 1.19447f, 0.25673f,
- -0.21631f, -0.43562f, -0.27497f, -0.02206f, -0.56169f, 0.58952f, -0.60983f,
- -0.64088f, -0.69087f, -0.56261f, -0.74089f, -0.65063f, -0.66978f, -0.60836f,
- -0.92770f, -0.77182f, -1.61031f, -0.70007f, -0.68402f, -0.42242f, -0.66722f,
- -0.14533f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer0[32] = {
- 1.53781f, -0.49320f, -0.31646f, 0.02826f, -1.05554f, 0.06559f, -0.12399f,
- -0.61671f, -0.28956f, -0.15419f, 0.87189f, -0.43375f, -1.08477f, -0.66006f,
- 0.36233f, 0.82678f, -0.92342f, -1.47101f, -0.02937f, -0.16497f, -0.75457f,
- 0.50173f, -0.07560f, 0.71598f, 1.50795f, -0.04745f, -0.14008f, -0.18510f,
- -0.14988f, -0.67044f, 0.79659f, 0.70610f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_hor_layer1[128] = {
- 0.84983f, -0.62530f, -0.82600f, -0.52563f, -0.11942f, -0.50279f, -0.13425f,
- -0.02850f, 0.50767f, 0.10252f, 0.24540f, 0.67748f, -0.43483f, -0.22242f,
- 0.23431f, 0.57287f, 0.69560f, 1.13814f, -0.47427f, -0.55858f, -1.47072f,
- 0.26587f, -0.36335f, 0.83060f, 1.01645f, -0.52895f, -0.11614f, 0.17390f,
- -0.13664f, -0.83098f, -0.07985f, -1.36820f, 0.47759f, -0.55612f, 0.46852f,
- 0.07406f, -0.80467f, 0.23059f, 0.09992f, -0.06164f, 0.13541f, 0.06135f,
- 0.83605f, -0.53224f, -0.13867f, 0.93838f, -0.61290f, 0.27732f, -0.46688f,
- -0.41810f, 0.12885f, 0.13619f, -0.24612f, 0.07215f, 0.98866f, 0.10993f,
- 1.05799f, -0.27146f, -0.00079f, -0.08585f, 0.08322f, -0.33809f, 0.67598f,
- -1.06515f, 1.28866f, 0.61028f, -0.31704f, -0.59905f, 1.62151f, 0.10969f,
- 0.20671f, -0.17818f, 0.14170f, 0.19322f, 0.30602f, 0.93111f, 0.19011f,
- -0.45609f, 0.82506f, 0.32936f, -0.07858f, -0.27106f, -0.31638f, 0.23299f,
- 0.81491f, 0.32584f, -0.52093f, -0.32472f, 0.53643f, -0.42605f, 0.01641f,
- 0.09002f, 0.15832f, -0.08790f, 0.05511f, 1.00730f, 0.46309f, 0.68166f,
- -0.18835f, 0.64512f, -1.00540f, 0.86802f, 0.18981f, -0.06982f, -0.24514f,
- -0.08027f, 0.61199f, -0.20830f, 0.72001f, 0.17477f, 0.06511f, 0.00801f,
- -0.43590f, 0.37257f, 0.70323f, 0.60233f, 1.62541f, 0.74383f, -0.22254f,
- -0.33892f, 0.22881f, 0.62817f, 0.68915f, -0.06417f, 0.00969f, 1.65869f,
- 0.89060f, 0.75948f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_hor_layer1[4] = {
- 0.95359f,
- 1.56043f,
- 1.06017f,
- 2.54520f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_hor = {
- 16, // num_inputs
- 4, // num_outputs
- 1, // num_hidden_layers
- {
- 32,
- }, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_32x16_hor_layer0,
- av1_tx_type_nn_weights_32x16_hor_layer1,
- },
- {
- av1_tx_type_nn_bias_32x16_hor_layer0,
- av1_tx_type_nn_bias_32x16_hor_layer1,
- },
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
};
-static const float av1_tx_type_nn_weights_32x16_ver_layer0[128] = {
- 1.30219f, 1.30548f, 1.33334f, 1.20560f, 1.01572f, 1.38100f, 1.37504f,
- 0.12599f, -0.96957f, 0.19400f, 0.75734f, 0.11295f, -0.40447f, -1.53062f,
- -0.82980f, 0.02168f, -1.11289f, -0.66861f, -0.83663f, -0.91455f, -0.78618f,
- -0.87176f, -1.10711f, 0.71207f, 1.49689f, -0.12715f, 0.29357f, 0.35234f,
- 0.61016f, 0.80708f, 0.83564f, 1.05961f, -0.99842f, 0.82004f, 0.02638f,
- 0.44606f, 0.32298f, 0.21321f, 0.47290f, -0.71442f, -2.81050f, -0.02520f,
- -0.08919f, 0.00369f, -0.05257f, -0.07011f, -0.16394f, 0.06290f, 0.80086f,
- 0.32349f, 0.47411f, 1.36126f, 1.68162f, 0.91325f, -0.27495f, 0.00262f,
- 0.06025f, 0.42832f, 0.36965f, 0.38063f, 0.32772f, 0.40914f, 0.44510f,
- 3.02239f, -1.84077f, 0.49536f, -0.27340f, -0.10437f, -0.34293f, -0.08047f,
- -0.29651f, -0.97111f, -0.34187f, 0.52869f, 1.27240f, 1.20306f, 1.19121f,
- 1.28742f, 0.26393f, -0.62319f, 0.92285f, -0.08303f, -0.33118f, -0.13053f,
- 0.24875f, -0.52089f, 0.44691f, -1.08908f, 1.20921f, 0.36538f, -0.46792f,
- -0.18855f, -0.13443f, -0.28472f, -0.10353f, 0.06911f, 0.68519f, 0.08228f,
- -0.49027f, -0.34381f, 0.04719f, -0.33298f, 0.72525f, 0.09538f, -0.29216f,
- -0.07260f, -0.55827f, 0.54542f, -0.10144f, -0.09292f, -0.14427f, -0.38361f,
- -0.41559f, 0.75338f, -0.04530f, 0.27944f, 0.06932f, -0.11537f, 0.29568f,
- 1.92155f, -0.98996f, -0.08841f, 0.49386f, 0.15947f, 0.53290f, 1.46747f,
- 0.59360f, 0.25468f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer0[16] = {
- -1.19673f, 0.33043f, 0.24408f, 0.46221f, 2.00646f, 0.19031f,
- -0.64944f, -0.43452f, 1.04400f, 1.47371f, 0.52460f, -1.39577f,
- 0.83852f, -0.25536f, 1.33200f, -0.24444f,
-};
-
-static const float av1_tx_type_nn_weights_32x16_ver_layer1[64] = {
- -1.31447f, -0.86455f, 0.85217f, 1.00048f, 0.37395f, -1.35713f, -0.54032f,
- 0.82803f, 0.89606f, 1.57696f, 0.68067f, 0.42512f, -0.26250f, 0.14621f,
- 0.93249f, -0.77690f, -0.93652f, -0.44488f, 0.68360f, -0.88178f, 1.89111f,
- 0.67700f, -0.29310f, 0.91604f, -1.21881f, 1.11188f, 0.45045f, -0.86119f,
- -0.09294f, 0.09360f, 0.80794f, 0.41027f, 1.80399f, -0.50362f, -1.44689f,
- 0.85148f, 0.90707f, -0.18458f, 0.14165f, 1.17367f, 0.70869f, 1.57147f,
- 0.24692f, 0.16626f, 0.56794f, 0.07313f, 0.14728f, -0.74296f, 1.74127f,
- 1.26560f, 0.17753f, 1.10194f, 0.56435f, 1.73779f, 1.42841f, -1.16773f,
- 0.24584f, 0.10813f, -0.60187f, 0.79802f, 0.75229f, -0.06112f, 1.77282f,
- 1.01058f,
-};
-
-static const float av1_tx_type_nn_bias_32x16_ver_layer1[4] = {
- 0.83082f,
- 2.03845f,
- 0.59627f,
- 2.31341f,
-};
-
-static const NN_CONFIG av1_tx_type_nnconfig_32x16_ver = {
- 8, // num_inputs
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
4, // num_outputs
1, // num_hidden_layers
{
- 16,
+ 8,
}, // num_hidden_nodes
- {
- av1_tx_type_nn_weights_32x16_ver_layer0,
- av1_tx_type_nn_weights_32x16_ver_layer1,
- },
- {
- av1_tx_type_nn_bias_32x16_ver_layer0,
- av1_tx_type_nn_bias_32x16_ver_layer1,
- },
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
};
/******************************************************************************/
// Map tx_size to its corresponding neural net model for tx type prediction.
static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = {
- &av1_tx_type_nnconfig_4x4, // 4x4
- &av1_tx_type_nnconfig_8x8, // 8x8
- &av1_tx_type_nnconfig_16x16, // 16x16
- NULL, // 32x32
- NULL, // 64x64
- &av1_tx_type_nnconfig_4x8_hor, // 4x8
- &av1_tx_type_nnconfig_8x4_hor, // 8x4
- &av1_tx_type_nnconfig_8x16_hor, // 8x16
- &av1_tx_type_nnconfig_16x8_hor, // 16x8
- &av1_tx_type_nnconfig_16x32_hor, // 16x32
- &av1_tx_type_nnconfig_32x16_hor, // 32x16
- NULL, // 32x64
- NULL, // 64x32
- NULL, // 4x16
- NULL, // 16x4
- NULL, // 8x32
- NULL, // 32x8
- NULL, // 16x64
- NULL, // 64x16
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
};
static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = {
- &av1_tx_type_nnconfig_4x4, // 4x4 transform
- &av1_tx_type_nnconfig_8x8, // 8x8 transform
- &av1_tx_type_nnconfig_16x16, // 16x16 transform
- NULL, // 32x32 transform
- NULL, // 64x64 transform
- &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
- &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
- &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
- &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
- &av1_tx_type_nnconfig_16x32_ver, // 16x32 transform
- &av1_tx_type_nnconfig_32x16_ver, // 32x16 transform
- NULL, // 32x64 transform
- NULL, // 64x32 transform
- NULL, // 4x16 transform
- NULL, // 16x4 transform
- NULL, // 8x32 transform
- NULL, // 32x8 transform
- NULL, // 16x64 transform
- NULL, // 64x16 transform
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
};
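
For context on how these NN_CONFIG tables are consumed, below is a minimal sketch of a single-hidden-layer forward pass with ReLU activation, matching the field layout visible in the initializers above (num_inputs, num_outputs, one hidden layer, per-layer weight and bias arrays). The evaluator name and the weight-matrix orientation are assumptions for illustration; the patch itself only defines the tables.

/* Hypothetical sketch: evaluate one of the 1-hidden-layer tx-type models.
 * Assumes weights[0] stores one row of num_inputs floats per hidden node,
 * and weights[1] one row of hidden activations per output. */
static void nn_predict_sketch(const float *features, const NN_CONFIG *cfg,
                              float *scores) {
  float hidden[64]; /* assumed upper bound on num_hidden_nodes[0] */
  const int nh = cfg->num_hidden_nodes[0];
  for (int h = 0; h < nh; ++h) {
    float sum = cfg->bias[0][h];
    for (int i = 0; i < cfg->num_inputs; ++i)
      sum += cfg->weights[0][h * cfg->num_inputs + i] * features[i];
    hidden[h] = sum > 0.0f ? sum : 0.0f; /* ReLU */
  }
  for (int o = 0; o < cfg->num_outputs; ++o) {
    float sum = cfg->bias[1][o];
    for (int h = 0; h < nh; ++h)
      sum += cfg->weights[1][o * nh + h] * hidden[h];
    scores[o] = sum; /* one score per candidate tx type */
  }
}

The array sizes line up with this reading: av1_tx_type_nnconfig_16x4_ver declares 4 inputs, 8 hidden nodes, and 4 outputs, matching its 32-entry layer-0 and 32-entry layer-1 weight tables above.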
// Tx split model for 4x8 block.
@@ -2083,4 +1941,4 @@ static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
} // extern "C"
#endif
-#endif // AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
index c71f2e74c..07615543c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -395,7 +395,8 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
}
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- int8_t cos_bit) {
+ int8_t cos_bit, const int instride,
+ const int outstride) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
@@ -480,70 +481,70 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
// stage 1
__m128i x1[64];
- x1[0] = _mm_add_epi32(input[0], input[63]);
- x1[63] = _mm_sub_epi32(input[0], input[63]);
- x1[1] = _mm_add_epi32(input[1], input[62]);
- x1[62] = _mm_sub_epi32(input[1], input[62]);
- x1[2] = _mm_add_epi32(input[2], input[61]);
- x1[61] = _mm_sub_epi32(input[2], input[61]);
- x1[3] = _mm_add_epi32(input[3], input[60]);
- x1[60] = _mm_sub_epi32(input[3], input[60]);
- x1[4] = _mm_add_epi32(input[4], input[59]);
- x1[59] = _mm_sub_epi32(input[4], input[59]);
- x1[5] = _mm_add_epi32(input[5], input[58]);
- x1[58] = _mm_sub_epi32(input[5], input[58]);
- x1[6] = _mm_add_epi32(input[6], input[57]);
- x1[57] = _mm_sub_epi32(input[6], input[57]);
- x1[7] = _mm_add_epi32(input[7], input[56]);
- x1[56] = _mm_sub_epi32(input[7], input[56]);
- x1[8] = _mm_add_epi32(input[8], input[55]);
- x1[55] = _mm_sub_epi32(input[8], input[55]);
- x1[9] = _mm_add_epi32(input[9], input[54]);
- x1[54] = _mm_sub_epi32(input[9], input[54]);
- x1[10] = _mm_add_epi32(input[10], input[53]);
- x1[53] = _mm_sub_epi32(input[10], input[53]);
- x1[11] = _mm_add_epi32(input[11], input[52]);
- x1[52] = _mm_sub_epi32(input[11], input[52]);
- x1[12] = _mm_add_epi32(input[12], input[51]);
- x1[51] = _mm_sub_epi32(input[12], input[51]);
- x1[13] = _mm_add_epi32(input[13], input[50]);
- x1[50] = _mm_sub_epi32(input[13], input[50]);
- x1[14] = _mm_add_epi32(input[14], input[49]);
- x1[49] = _mm_sub_epi32(input[14], input[49]);
- x1[15] = _mm_add_epi32(input[15], input[48]);
- x1[48] = _mm_sub_epi32(input[15], input[48]);
- x1[16] = _mm_add_epi32(input[16], input[47]);
- x1[47] = _mm_sub_epi32(input[16], input[47]);
- x1[17] = _mm_add_epi32(input[17], input[46]);
- x1[46] = _mm_sub_epi32(input[17], input[46]);
- x1[18] = _mm_add_epi32(input[18], input[45]);
- x1[45] = _mm_sub_epi32(input[18], input[45]);
- x1[19] = _mm_add_epi32(input[19], input[44]);
- x1[44] = _mm_sub_epi32(input[19], input[44]);
- x1[20] = _mm_add_epi32(input[20], input[43]);
- x1[43] = _mm_sub_epi32(input[20], input[43]);
- x1[21] = _mm_add_epi32(input[21], input[42]);
- x1[42] = _mm_sub_epi32(input[21], input[42]);
- x1[22] = _mm_add_epi32(input[22], input[41]);
- x1[41] = _mm_sub_epi32(input[22], input[41]);
- x1[23] = _mm_add_epi32(input[23], input[40]);
- x1[40] = _mm_sub_epi32(input[23], input[40]);
- x1[24] = _mm_add_epi32(input[24], input[39]);
- x1[39] = _mm_sub_epi32(input[24], input[39]);
- x1[25] = _mm_add_epi32(input[25], input[38]);
- x1[38] = _mm_sub_epi32(input[25], input[38]);
- x1[26] = _mm_add_epi32(input[26], input[37]);
- x1[37] = _mm_sub_epi32(input[26], input[37]);
- x1[27] = _mm_add_epi32(input[27], input[36]);
- x1[36] = _mm_sub_epi32(input[27], input[36]);
- x1[28] = _mm_add_epi32(input[28], input[35]);
- x1[35] = _mm_sub_epi32(input[28], input[35]);
- x1[29] = _mm_add_epi32(input[29], input[34]);
- x1[34] = _mm_sub_epi32(input[29], input[34]);
- x1[30] = _mm_add_epi32(input[30], input[33]);
- x1[33] = _mm_sub_epi32(input[30], input[33]);
- x1[31] = _mm_add_epi32(input[31], input[32]);
- x1[32] = _mm_sub_epi32(input[31], input[32]);
+ x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+ x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+ x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+ x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+ x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+ x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+ x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+ x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+ x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+ x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+ x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+ x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+ x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+ x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+ x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+ x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+ x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+ x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+ x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+ x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+ x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+ x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+ x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+ x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+ x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+ x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+ x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+ x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+ x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+ x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+ x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+ x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+ x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+ x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+ x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+ x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+ x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+ x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+ x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+ x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+ x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+ x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+ x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+ x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+ x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+ x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+ x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+ x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+ x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+ x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+ x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+ x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+ x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+ x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+ x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+ x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+ x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+ x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+ x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+ x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+ x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+ x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+ x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+ x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
// stage 2
__m128i x2[64];
@@ -1149,68 +1150,68 @@ void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
x10[48], __rounding, cos_bit);
// stage 11
- output[0] = x10[0];
- output[1] = x10[32];
- output[2] = x10[16];
- output[3] = x10[48];
- output[4] = x10[8];
- output[5] = x10[40];
- output[6] = x10[24];
- output[7] = x10[56];
- output[8] = x10[4];
- output[9] = x10[36];
- output[10] = x10[20];
- output[11] = x10[52];
- output[12] = x10[12];
- output[13] = x10[44];
- output[14] = x10[28];
- output[15] = x10[60];
- output[16] = x10[2];
- output[17] = x10[34];
- output[18] = x10[18];
- output[19] = x10[50];
- output[20] = x10[10];
- output[21] = x10[42];
- output[22] = x10[26];
- output[23] = x10[58];
- output[24] = x10[6];
- output[25] = x10[38];
- output[26] = x10[22];
- output[27] = x10[54];
- output[28] = x10[14];
- output[29] = x10[46];
- output[30] = x10[30];
- output[31] = x10[62];
- output[32] = x10[1];
- output[33] = x10[33];
- output[34] = x10[17];
- output[35] = x10[49];
- output[36] = x10[9];
- output[37] = x10[41];
- output[38] = x10[25];
- output[39] = x10[57];
- output[40] = x10[5];
- output[41] = x10[37];
- output[42] = x10[21];
- output[43] = x10[53];
- output[44] = x10[13];
- output[45] = x10[45];
- output[46] = x10[29];
- output[47] = x10[61];
- output[48] = x10[3];
- output[49] = x10[35];
- output[50] = x10[19];
- output[51] = x10[51];
- output[52] = x10[11];
- output[53] = x10[43];
- output[54] = x10[27];
- output[55] = x10[59];
- output[56] = x10[7];
- output[57] = x10[39];
- output[58] = x10[23];
- output[59] = x10[55];
- output[60] = x10[15];
- output[61] = x10[47];
- output[62] = x10[31];
- output[63] = x10[63];
+ output[0 * outstride] = x10[0];
+ output[1 * outstride] = x10[32];
+ output[2 * outstride] = x10[16];
+ output[3 * outstride] = x10[48];
+ output[4 * outstride] = x10[8];
+ output[5 * outstride] = x10[40];
+ output[6 * outstride] = x10[24];
+ output[7 * outstride] = x10[56];
+ output[8 * outstride] = x10[4];
+ output[9 * outstride] = x10[36];
+ output[10 * outstride] = x10[20];
+ output[11 * outstride] = x10[52];
+ output[12 * outstride] = x10[12];
+ output[13 * outstride] = x10[44];
+ output[14 * outstride] = x10[28];
+ output[15 * outstride] = x10[60];
+ output[16 * outstride] = x10[2];
+ output[17 * outstride] = x10[34];
+ output[18 * outstride] = x10[18];
+ output[19 * outstride] = x10[50];
+ output[20 * outstride] = x10[10];
+ output[21 * outstride] = x10[42];
+ output[22 * outstride] = x10[26];
+ output[23 * outstride] = x10[58];
+ output[24 * outstride] = x10[6];
+ output[25 * outstride] = x10[38];
+ output[26 * outstride] = x10[22];
+ output[27 * outstride] = x10[54];
+ output[28 * outstride] = x10[14];
+ output[29 * outstride] = x10[46];
+ output[30 * outstride] = x10[30];
+ output[31 * outstride] = x10[62];
+ output[32 * outstride] = x10[1];
+ output[33 * outstride] = x10[33];
+ output[34 * outstride] = x10[17];
+ output[35 * outstride] = x10[49];
+ output[36 * outstride] = x10[9];
+ output[37 * outstride] = x10[41];
+ output[38 * outstride] = x10[25];
+ output[39 * outstride] = x10[57];
+ output[40 * outstride] = x10[5];
+ output[41 * outstride] = x10[37];
+ output[42 * outstride] = x10[21];
+ output[43 * outstride] = x10[53];
+ output[44 * outstride] = x10[13];
+ output[45 * outstride] = x10[45];
+ output[46 * outstride] = x10[29];
+ output[47 * outstride] = x10[61];
+ output[48 * outstride] = x10[3];
+ output[49 * outstride] = x10[35];
+ output[50 * outstride] = x10[19];
+ output[51 * outstride] = x10[51];
+ output[52 * outstride] = x10[11];
+ output[53 * outstride] = x10[43];
+ output[54 * outstride] = x10[27];
+ output[55 * outstride] = x10[59];
+ output[56 * outstride] = x10[7];
+ output[57 * outstride] = x10[39];
+ output[58 * outstride] = x10[23];
+ output[59 * outstride] = x10[55];
+ output[60 * outstride] = x10[15];
+ output[61 * outstride] = x10[47];
+ output[62 * outstride] = x10[31];
+ output[63 * outstride] = x10[63];
}
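
The instride/outstride parameters added above let the same 1-D DCT-64 kernel walk buffers in which vector columns are interleaved, rather than requiring a contiguous 64-element array. A hedged sketch of the resulting call pattern for a full column pass, consistent with the fdct64_new_sse4_1 wrapper introduced in the next file:

/* Sketch: 64x64 column pass with 4 lanes per __m128i, so 64/4 = 16 vector
 * columns. Row r of vector column c lives at buf[r * col_num + c], hence
 * the kernel reads input[i * instride] and writes output[i * outstride]. */
const int col_num = 64 / 4; /* 16 */
for (int col = 0; col < col_num; ++col)
  av1_fdct64_new_sse4_1(input + col, output + col, cos_bit, col_num, col_num);

The fully packed lowbd callers below (lowbd_fwd_txfm2d_64x64_sse4_1 and friends) simply pass instride = outstride = 1, preserving the old contiguous behavior.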
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
index abb95f31e..8ec0256eb 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -14,6 +14,7 @@
#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
@@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
}
}
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+ col_num);
+ }
+}
+
static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
+ case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
default: assert(0);
}
return NULL;
@@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
transpose_32(txfm_size, buf_128, out_128);
}
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+ /*col wise transform*/
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+ /*row wise transform*/
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+ col_num, (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32x32(buf_128, out_128);
+}
+
void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
@@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
}
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
const __m128i *inputB, __m128i *output) {
__m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
@@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
@@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
bufA[j] = _mm_cvtepi16_epi32(buf[j]);
bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
}
- av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row);
- av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < width_div8; ++j) {
@@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
}
av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
- av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
- av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
int32_t *output8 = output + 8 * 32 * i;
for (int j = 0; j < (32 / 4); ++j) {
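
The rectangular kernels above now pass NewSqrt2 into the round-shift helper so 2:1 blocks keep unit norm. A scalar model of that helper's bit > 0 path, assuming NewSqrt2 = 5793 and NewSqrt2Bits = 12 as defined in av1/common/av1_txfm.h:

#include <stdint.h>

#define NEW_SQRT2_BITS 12 /* NewSqrt2Bits in av1/common/av1_txfm.h */
#define NEW_SQRT2 5793    /* round(sqrt(2) * 2^12) */

static int32_t round2(int64_t v, int bits) {
  return (int32_t)((v + ((int64_t)1 << (bits - 1))) >> bits);
}

/* Scalar sketch of av1_round_shift_rect_array_32_sse4_1(in, out, size, bit,
 * NewSqrt2) for bit > 0: round-shift each value by 'bit', then rescale by
 * sqrt(2) in Q12. Usage mirrors the calls above, e.g. bit = -shift[2]. */
static void round_shift_rect_scalar(const int32_t *in, int32_t *out,
                                    int size, int bit, int val) {
  for (int i = 0; i < size; ++i) {
    const int32_t r = round2(in[i], bit);
    out[i] = round2((int64_t)r * val, NEW_SQRT2_BITS);
  }
}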
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
index c582ca0e3..38707137c 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_FWD_TXFM_AVX2_H_
-#define AV1_FWD_TXFM_AVX2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
#include <immintrin.h>
static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
@@ -100,4 +100,4 @@ static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
*in1 = _mm256_srai_epi32(temp1, cos_bit);
}
-#endif // AV1_FWD_TXFM_AVX2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
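
The guard rename here is mechanical; the helper the hunk context shows, av1_round_shift_32_avx2, implements the usual rounding shift round2(x, bit) = (x + 2^(bit-1)) >> bit over eight lanes. A plausible body, sketched from that scalar definition rather than copied from the header:

#include <immintrin.h>

/* Sketch: vectorized round2(x, bit) across eight int32 lanes. */
static inline __m256i round_shift_32_avx2_sketch(__m256i vec, int bit) {
  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
  return _mm256_srai_epi32(_mm256_add_epi32(vec, rounding), bit);
}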
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
index aa14d3ade..99a6b9082 100644
--- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -8,8 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
-#define AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
#include <immintrin.h>
@@ -114,4 +114,4 @@ static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
}
#endif
-#endif // AV1_COMMON_X86_AV1_FWD_TXFM_SSE2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
index 0adefecdb..6df2a8bdb 100644
--- a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_TXMF1D_SSE2_H_
-#define AV1_TXMF1D_SSE2_H_
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
@@ -29,7 +29,8 @@ void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
int8_t cos_bit);
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- int8_t cos_bit);
+ int8_t cos_bit, const int instride,
+ const int outstride);
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t cos_bit, const int8_t *stage_range);
@@ -138,4 +139,4 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input,
}
#endif
-#endif // AV1_TXMF1D_SSE2_H_
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 000000000..7642f57d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ yy_storeu_256(pre_buf, y_zeros);
+ pre_buf += 32;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < height);
+ } else if (width == 8) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + width) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + width + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + width + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + width + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ *(int32_t *)(ls + stride + width) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < height);
+ } else {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
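
The new AVX2 path computes the same result as the scalar reference: zero the padded top and bottom rows, then write the saturated absolute value of each coefficient into a (width + TX_PAD_HOR)-stride grid, zeroing the horizontal pad. A scalar sketch, with the pad constants assumed from av1/common/txb_common.h:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define TX_PAD_HOR 4 /* assumed values from av1/common/txb_common.h */
#define TX_PAD_TOP 2
#define TX_PAD_BOTTOM 2
#define TX_PAD_END 16
typedef int32_t tran_low_t;

/* Scalar model of av1_txb_init_levels_avx2. 'levels' must point TX_PAD_TOP
 * rows into its allocation, as in the caller. The saturating
 * _mm256_packs_epi16 above clamps levels to 127, matching the clamp here. */
static void txb_init_levels_scalar(const tran_low_t *coeff, int width,
                                   int height, uint8_t *levels) {
  const int stride = width + TX_PAD_HOR;
  memset(levels - TX_PAD_TOP * stride, 0, TX_PAD_TOP * stride);
  memset(levels + stride * height, 0, TX_PAD_BOTTOM * stride + TX_PAD_END);
  uint8_t *ls = levels;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const int a = abs(coeff[i * width + j]);
      *ls++ = (uint8_t)(a > 127 ? 127 : a);
    }
    for (int j = 0; j < TX_PAD_HOR; j++) *ls++ = 0;
  }
}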
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
index b3a879b0f..5e0687cd3 100644
--- a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -14,43 +14,55 @@
#include <smmintrin.h> /* SSE4.1 */
#include "aom/aom_integer.h"
-#include "aom_dsp/x86/mem_sse2.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
const int height, uint8_t *const levels) {
const int stride = width + TX_PAD_HOR;
- memset(levels - TX_PAD_TOP * stride, 0,
- sizeof(*levels) * TX_PAD_TOP * stride);
- memset(levels + stride * height, 0,
- sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
-
const __m128i zeros = _mm_setzero_si128();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+ pre_buf += 16;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * height;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
int i = 0;
uint8_t *ls = levels;
const tran_low_t *cf = coeff;
if (width == 4) {
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
- _mm_storeu_si128((__m128i *)ls, lsAB);
+ xx_storeu_128(ls, lsAB);
ls += (stride << 1);
cf += (width << 1);
i += 2;
} while (i < height);
} else if (width == 8) {
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
- _mm_storeu_si128((__m128i *)ls, absAB8);
+ xx_storeu_128(ls, absAB8);
ls += stride;
cf += width;
i += 1;
@@ -59,16 +71,16 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
do {
int j = 0;
do {
- const __m128i coeffA = _mm_load_si128((__m128i *)(cf));
- const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4));
- const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8));
- const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12));
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
const __m128i absAB = _mm_abs_epi16(coeffAB);
const __m128i absCD = _mm_abs_epi16(coeffCD);
const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
- _mm_storeu_si128((__m128i *)(ls + j), absABCD);
+ xx_storeu_128(ls + j, absABCD);
j += 16;
cf += 16;
} while (j < width);
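
The hunks above replace aligned _mm_load_si128/_mm_storeu_si128 pairs with the xx_loadu_128/xx_storeu_128 synonyms, dropping the 16-byte alignment requirement on the coefficient buffer. The wrappers are thin; a sketch of what aom_dsp/x86/synonyms.h provides:

#include <emmintrin.h>

/* Unaligned 128-bit load/store wrappers, as assumed from synonyms.h. */
static inline __m128i loadu_128_sketch(const void *a) {
  return _mm_loadu_si128((const __m128i *)a);
}
static inline void storeu_128_sketch(void *a, __m128i v) {
  _mm_storeu_si128((__m128i *)a, v);
}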
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 4cd6371a6..535485ae8 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -17,6 +17,7 @@
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
@@ -393,7 +394,32 @@ static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
_mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
}
-static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+ const int stride) {
+ _mm_storeu_si128((__m128i *)(output), res[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), res[1]);
+ _mm_storeu_si128((__m128i *)(output + stride), res[2]);
+ _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -589,7 +615,9 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
out[13] = u[3]; // buf0[3]
}
-static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
@@ -780,82 +808,82 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
switch (tx_type) {
case DCT_DCT:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_DCT:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_ADST:
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_DCT:
load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case DCT_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_FLIPADST:
load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case ADST_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
case FLIPADST_ADST:
load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
- fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
transpose_8x8(out, in);
write_buffer_8x8(in, coeff);
break;
@@ -940,7 +968,26 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
convert_8x8_to_16x16(in, out);
}
-static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
@@ -962,7 +1009,6 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
__m128i u[16], v[16], x;
- const int col_num = 4;
int col;
// Calculate the column 0, 1, 2, 3
@@ -1226,7 +1272,8 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
}
}
-static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
@@ -1271,25 +1318,25 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
__m128i u[16], v[16], x, y;
int col;
- for (col = 0; col < 4; ++col) {
+ for (col = 0; col < num_cols; ++col) {
// stage 0
// stage 1
- u[0] = in[0 * 4 + col];
- u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
- u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
- u[3] = in[8 * 4 + col];
- u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
- u[5] = in[12 * 4 + col];
- u[6] = in[4 * 4 + col];
- u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
- u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
- u[9] = in[14 * 4 + col];
- u[10] = in[6 * 4 + col];
- u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
- u[12] = in[2 * 4 + col];
- u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
- u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
- u[15] = in[10 * 4 + col];
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
// stage 2
v[0] = u[0];
@@ -1453,22 +1500,22 @@ static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
// stage 9
- out[0 * 4 + col] = v[1];
- out[1 * 4 + col] = v[14];
- out[2 * 4 + col] = v[3];
- out[3 * 4 + col] = v[12];
- out[4 * 4 + col] = v[5];
- out[5 * 4 + col] = v[10];
- out[6 * 4 + col] = v[7];
- out[7 * 4 + col] = v[8];
- out[8 * 4 + col] = v[9];
- out[9 * 4 + col] = v[6];
- out[10 * 4 + col] = v[11];
- out[11 * 4 + col] = v[4];
- out[12 * 4 + col] = v[13];
- out[13 * 4 + col] = v[2];
- out[14 * 4 + col] = v[15];
- out[15 * 4 + col] = v[0];
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
}
}
@@ -1482,6 +1529,11 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
col_txfm_8x8_rounding(&in[48], shift);
}
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+}
+
static void write_buffer_16x16(const __m128i *in, int32_t *output) {
const int size_8x8 = 16 * 4;
write_buffer_8x8(&in[0], output);
@@ -1499,85 +1551,86 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
const int txw_idx = get_txw_idx(TX_16X16);
const int txh_idx = get_txh_idx(TX_16X16);
+ const int col_num = 4;
switch (tx_type) {
case DCT_DCT:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_DCT:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_ADST:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_ADST:
load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_DCT:
load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case DCT_FLIPADST:
load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_FLIPADST:
load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case ADST_FLIPADST:
load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
case FLIPADST_ADST:
load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
col_txfm_16x16_rounding(out, -shift[1]);
transpose_16x16(out, in);
- fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
transpose_16x16(out, in);
write_buffer_16x16(in, coeff);
break;
@@ -1585,3 +1638,146 @@ void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
}
(void)bd;
}
+
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+ for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+ col_txfm(in, in, bit, 0);
+ col_txfm_8x8_rounding(in, -shift[1]);
+ transpose_8x8(in, out + i * 16);
+ }
+
+ if (lr_flip) {
+ flip_buf_sse4_1(in, out, 32);
+ row_txfm(in, out, bit, 2);
+ } else {
+ row_txfm(out, out, bit, 2);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ transpose_8x8(out + i * 16, in);
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_16x8(in, coeff + i * 8, 16);
+ }
+
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x16_rounding(in, -shift[1]);
+ transpose_8x8(in, out);
+ transpose_8x8(in + 16, out + 16);
+
+ for (int i = 0; i < 2; i++) {
+ row_txfm(out + i * 16, out, bit, 0);
+ transpose_8x8(out, in);
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_8x8(in, coeff + i * 64);
+ }
+
+ (void)bd;
+}
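
Both new entry points follow the same separable 2-D pattern: column transform, rounding, transpose, row transform, transpose back, then the NewSqrt2 rescale for the rectangular shape. A scalar skeleton of that flow (rounding steps elided; the function-pointer type is illustrative, not libaom's):

#include <stdint.h>

typedef void (*txfm1d_fn)(const int32_t *in, int32_t *out, int n);

/* Sketch of the 2-D flow used by av1_fwd_txfm2d_8x16_sse4_1 and
 * av1_fwd_txfm2d_16x8_sse4_1 above; 'buf' is w*h scratch. The real code
 * also applies the shift[0..2] roundings and the sqrt(2) rescale. */
static void fwd_txfm2d_sketch(const int32_t *input, int32_t *output, int w,
                              int h, txfm1d_fn col_fn, txfm1d_fn row_fn,
                              int32_t *buf) {
  int32_t col_in[64], col_out[64]; /* assumes h <= 64 */
  for (int c = 0; c < w; ++c) {    /* column pass over each of the w columns */
    for (int r = 0; r < h; ++r) col_in[r] = input[r * w + c];
    col_fn(col_in, col_out, h);
    for (int r = 0; r < h; ++r) buf[r * w + c] = col_out[r];
  }
  for (int r = 0; r < h; ++r)      /* row pass over each intermediate row */
    row_fn(buf + r * w, output + r * w, w);
}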
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 000000000..06aaaa7ee
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m256i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
+ const __m256i dst0 = yy_loadu_256(dst);
+ const __m256i r0 = _mm256_add_epi32(dst0, d0);
+ yy_storeu_256(dst, r0);
+}
+
+static INLINE void acc_stat_win7_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
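
compute_stats builds the normal equations for the Wiener filter: M is the cross-correlation of each window tap with the source pixel, H the tap autocorrelation, both taken around the frame average. A scalar sketch of the statistic the one-line SIMD helpers above accumulate (tap ordering and mean handling are illustrative; the SIMD code accumulates in integers and folds the average in afterwards):

#include <stdint.h>

/* M and H are assumed zero-initialized by the caller; avg comes from
 * find_average as in the code above. H is filled symmetrically. */
static void compute_stats_sketch(int wiener_win, const uint8_t *dgd,
                                 const uint8_t *src, int h_start, int h_end,
                                 int v_start, int v_end, int dgd_stride,
                                 int src_stride, double avg, double *M,
                                 double *H) {
  const int win2 = wiener_win * wiener_win;
  const int half = wiener_win >> 1;
  for (int i = v_start; i < v_end; i++) {
    for (int j = h_start; j < h_end; j++) {
      const double X = (double)src[i * src_stride + j] - avg;
      double Y[7 * 7]; /* up to WIENER_WIN2 taps */
      int idx = 0;
      for (int k = -half; k <= half; k++)
        for (int l = -half; l <= half; l++)
          Y[idx++] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
      for (int a = 0; a < win2; a++) {
        M[a] += Y[a] * X;
        for (int b = a; b < win2; b++) {
          const double v = Y[a] * Y[b];
          H[a * win2 + b] += v;
          if (b != a) H[b * win2 + a] += v; /* symmetry */
        }
      }
    }
  }
}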
+
+static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
+ __m256i xq_coeff =
+ pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
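
The error kernel re-projects the self-guided filter outputs toward the source and sums squared errors. A scalar model of the two-filter case, matching the remainder loop in the first branch above (SGRPROJ constants assumed from av1/common/restoration.h):

#include <stdint.h>

#define SGRPROJ_RST_BITS 4 /* assumed from av1/common/restoration.h */
#define SGRPROJ_PRJ_BITS 7

/* Scalar model of av1_lowbd_pixel_proj_error_avx2 when both self-guided
 * filters are enabled (params->r[0] > 0 && params->r[1] > 0). */
static int64_t pixel_proj_error_scalar(const uint8_t *src, int width,
                                       int height, int src_stride,
                                       const uint8_t *dat, int dat_stride,
                                       const int32_t *flt0, int flt0_stride,
                                       const int32_t *flt1, int flt1_stride,
                                       const int xq[2]) {
  const int shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
  int64_t err = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int32_t u = (int32_t)dat[j] << SGRPROJ_RST_BITS;
      const int32_t v = xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
      const int32_t e = ((v + (1 << (shift - 1))) >> shift) + dat[j] - src[j];
      err += (int64_t)e * e;
    }
    src += src_stride;
    dat += dat_stride;
    flt0 += flt0_stride;
    flt1 += flt1_stride;
  }
  return err;
}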
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 000000000..04e4d1afc
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
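+// 5x5 chroma counterpart of compute_stats_win7_opt_sse4_1; identical
+// structure, smaller window.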
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
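+  // Remove the mean, exactly as in the 7x7 version above.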
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
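+
+// Dispatch on the Wiener window size: dedicated SSE4.1 paths cover the luma
+// (7x7) and chroma (5x5) windows; any other size falls back to C.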
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
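+// Broadcasts the 16-bit pair (a, b) into every 32-bit lane so that
+// _mm_madd_epi16 against interleaved inputs computes a * x + b * y per lane.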
+static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
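+// Sum of squared error between the source and the self-guided restoration
+// output (dat plus the projected filter corrections). The four branches
+// cover every combination of the two SGR filters being enabled.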
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
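+    // Both filters enabled: v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u),
+    // computed 8 pixels at a time with one madd per 4-pixel half.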
+ __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+ const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+ const __m128i v0 = _mm_madd_epi16(
+ xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i v1 = _mm_madd_epi16(
+ xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
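+      // Scalar code handles whatever the 8-wide loop left over.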
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
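+    // Only the first filter: xq_coeff pairs xq[0] with -(xq[0] << SGRPROJ_RST_BITS)
+    // so that madd against interleaved (flt0, d) yields xq[0] * (flt0 - u).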
+ __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
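+    // Mirror of the previous branch for the second filter only.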
+ __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else {
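+    // Neither filter enabled: plain sum of squared differences between dat
+    // and src, 16 pixels per step.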
+ __m128i sum32 = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width - 16; j += 16) {
+ const __m128i d = xx_loadu_128(dat + j);
+ const __m128i s = xx_loadu_128(src + j);
+ const __m128i d0 = _mm_cvtepu8_epi16(d);
+ const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+ const __m128i s0 = _mm_cvtepu8_epi16(s);
+ const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64_0, sum64_1);
+ }
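+  // Fold the two 64-bit lanes into the scalar error total.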
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
index f776e84c7..2a792f14e 100644
--- a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -14,7 +14,7 @@
 #include <smmintrin.h>
 
 #include "aom_dsp/x86/synonyms.h"
-
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom/aom_integer.h"
 
 #include "av1/common/reconinter.h"
@@ -31,7 +31,7 @@ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
   uint64_t csse;
 
   const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
-  const __m256i v_zext_q = _mm256_set1_epi64x(0xffffffff);
+  const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
 
   __m256i v_acc0_q = _mm256_setzero_si256();
 
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 000000000..5c8e0e09d
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
index 05860e8c0..daabf6766 100644
--- a/third_party/aom/av1/exports_dec
+++ b/third_party/aom/av1/exports_dec
@@ -1,2 +1,3 @@
 data aom_codec_av1_dx_algo
 text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 000000000..dab377575
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd