From 68569dee1416593955c1570d638b3d9250b33012 Mon Sep 17 00:00:00 2001 From: trav90 Date: Mon, 15 Oct 2018 21:45:30 -0500 Subject: Import aom library This is the reference implementation for the Alliance for Open Media's av1 video code. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36. --- third_party/aom/av1/encoder/aq_complexity.c | 163 + third_party/aom/av1/encoder/aq_complexity.h | 37 + third_party/aom/av1/encoder/aq_cyclicrefresh.c | 566 + third_party/aom/av1/encoder/aq_cyclicrefresh.h | 98 + third_party/aom/av1/encoder/aq_variance.c | 207 + third_party/aom/av1/encoder/aq_variance.h | 31 + third_party/aom/av1/encoder/arm/neon/dct_neon.c | 36 + third_party/aom/av1/encoder/arm/neon/error_neon.c | 42 + .../aom/av1/encoder/arm/neon/quantize_neon.c | 118 + third_party/aom/av1/encoder/av1_quantize.c | 1790 +++ third_party/aom/av1/encoder/av1_quantize.h | 184 + third_party/aom/av1/encoder/bitstream.c | 5399 ++++++++ third_party/aom/av1/encoder/bitstream.h | 53 + third_party/aom/av1/encoder/block.h | 241 + third_party/aom/av1/encoder/blockiness.c | 142 + third_party/aom/av1/encoder/context_tree.c | 331 + third_party/aom/av1/encoder/context_tree.h | 111 + third_party/aom/av1/encoder/corner_detect.c | 37 + third_party/aom/av1/encoder/corner_detect.h | 22 + third_party/aom/av1/encoder/corner_match.c | 193 + third_party/aom/av1/encoder/corner_match.h | 29 + third_party/aom/av1/encoder/cost.c | 67 + third_party/aom/av1/encoder/cost.h | 63 + third_party/aom/av1/encoder/daala_compat_enc.c | 30 + third_party/aom/av1/encoder/dct.c | 2228 ++++ third_party/aom/av1/encoder/encint.h | 51 + third_party/aom/av1/encoder/encodeframe.c | 7160 +++++++++++ third_party/aom/av1/encoder/encodeframe.h | 58 + third_party/aom/av1/encoder/encodemb.c | 1671 +++ third_party/aom/av1/encoder/encodemb.h | 92 + third_party/aom/av1/encoder/encodemv.c | 497 + third_party/aom/av1/encoder/encodemv.h | 43 + third_party/aom/av1/encoder/encoder.c | 5980 +++++++++ third_party/aom/av1/encoder/encoder.h | 883 ++ third_party/aom/av1/encoder/encodetxb.c | 784 ++ third_party/aom/av1/encoder/encodetxb.h | 53 + third_party/aom/av1/encoder/ethread.c | 176 + third_party/aom/av1/encoder/ethread.h | 34 + third_party/aom/av1/encoder/extend.c | 192 + third_party/aom/av1/encoder/extend.h | 32 + third_party/aom/av1/encoder/firstpass.c | 3026 +++++ third_party/aom/av1/encoder/firstpass.h | 202 + third_party/aom/av1/encoder/generic_encoder.c | 157 + third_party/aom/av1/encoder/global_motion.c | 319 + third_party/aom/av1/encoder/global_motion.h | 62 + third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 499 + third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 44 + third_party/aom/av1/encoder/laplace_encoder.c | 107 + third_party/aom/av1/encoder/lookahead.c | 225 + third_party/aom/av1/encoder/lookahead.h | 114 + third_party/aom/av1/encoder/mbgraph.c | 398 + third_party/aom/av1/encoder/mbgraph.h | 39 + third_party/aom/av1/encoder/mcomp.c | 3493 +++++ third_party/aom/av1/encoder/mcomp.h | 163 + third_party/aom/av1/encoder/mips/msa/error_msa.c | 108 + .../aom/av1/encoder/mips/msa/fdct16x16_msa.c | 436 + third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c | 98 + third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c | 65 + third_party/aom/av1/encoder/mips/msa/fdct_msa.h | 117 + .../aom/av1/encoder/mips/msa/temporal_filter_msa.c | 284 + third_party/aom/av1/encoder/palette.c | 277 + third_party/aom/av1/encoder/palette.h | 73 + third_party/aom/av1/encoder/pickcdef.c | 490 + third_party/aom/av1/encoder/picklpf.c | 211 + third_party/aom/av1/encoder/picklpf.h | 32 + third_party/aom/av1/encoder/pickrst.c | 1269 ++ third_party/aom/av1/encoder/pickrst.h | 30 + third_party/aom/av1/encoder/pvq_encoder.c | 988 ++ third_party/aom/av1/encoder/pvq_encoder.h | 53 + third_party/aom/av1/encoder/ransac.c | 1210 ++ third_party/aom/av1/encoder/ransac.h | 44 + third_party/aom/av1/encoder/ratectrl.c | 1759 +++ third_party/aom/av1/encoder/ratectrl.h | 284 + third_party/aom/av1/encoder/ratectrl_xiph.c | 1244 ++ third_party/aom/av1/encoder/ratectrl_xiph.h | 200 + third_party/aom/av1/encoder/rd.c | 1204 ++ third_party/aom/av1/encoder/rd.h | 505 + third_party/aom/av1/encoder/rdopt.c | 12713 +++++++++++++++++++ third_party/aom/av1/encoder/rdopt.h | 142 + third_party/aom/av1/encoder/segmentation.c | 394 + third_party/aom/av1/encoder/segmentation.h | 51 + third_party/aom/av1/encoder/speed_features.c | 506 + third_party/aom/av1/encoder/speed_features.h | 484 + third_party/aom/av1/encoder/subexp.c | 282 + third_party/aom/av1/encoder/subexp.h | 49 + third_party/aom/av1/encoder/temporal_filter.c | 719 ++ third_party/aom/av1/encoder/temporal_filter.h | 25 + third_party/aom/av1/encoder/tokenize.c | 887 ++ third_party/aom/av1/encoder/tokenize.h | 151 + third_party/aom/av1/encoder/treewriter.c | 59 + third_party/aom/av1/encoder/treewriter.h | 42 + third_party/aom/av1/encoder/variance_tree.c | 61 + third_party/aom/av1/encoder/variance_tree.h | 96 + third_party/aom/av1/encoder/wedge_utils.c | 125 + .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 193 + .../aom/av1/encoder/x86/av1_quantize_sse2.c | 211 + .../av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204 + .../aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 219 + third_party/aom/av1/encoder/x86/dct_intrin_sse2.c | 3884 ++++++ third_party/aom/av1/encoder/x86/dct_sse2.asm | 87 + third_party/aom/av1/encoder/x86/dct_ssse3.c | 469 + .../aom/av1/encoder/x86/error_intrin_avx2.c | 73 + third_party/aom/av1/encoder/x86/error_sse2.asm | 125 + .../encoder/x86/highbd_block_error_intrin_sse2.c | 72 + .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 1895 +++ .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 1678 +++ .../av1/encoder/x86/temporal_filter_apply_sse2.asm | 215 + third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 254 + 108 files changed, 76118 insertions(+) create mode 100644 third_party/aom/av1/encoder/aq_complexity.c create mode 100644 third_party/aom/av1/encoder/aq_complexity.h create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.c create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.h create mode 100644 third_party/aom/av1/encoder/aq_variance.c create mode 100644 third_party/aom/av1/encoder/aq_variance.h create mode 100644 third_party/aom/av1/encoder/arm/neon/dct_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/error_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/quantize_neon.c create mode 100644 third_party/aom/av1/encoder/av1_quantize.c create mode 100644 third_party/aom/av1/encoder/av1_quantize.h create mode 100644 third_party/aom/av1/encoder/bitstream.c create mode 100644 third_party/aom/av1/encoder/bitstream.h create mode 100644 third_party/aom/av1/encoder/block.h create mode 100644 third_party/aom/av1/encoder/blockiness.c create mode 100644 third_party/aom/av1/encoder/context_tree.c create mode 100644 third_party/aom/av1/encoder/context_tree.h create mode 100644 third_party/aom/av1/encoder/corner_detect.c create mode 100644 third_party/aom/av1/encoder/corner_detect.h create mode 100644 third_party/aom/av1/encoder/corner_match.c create mode 100644 third_party/aom/av1/encoder/corner_match.h create mode 100644 third_party/aom/av1/encoder/cost.c create mode 100644 third_party/aom/av1/encoder/cost.h create mode 100644 third_party/aom/av1/encoder/daala_compat_enc.c create mode 100644 third_party/aom/av1/encoder/dct.c create mode 100644 third_party/aom/av1/encoder/encint.h create mode 100644 third_party/aom/av1/encoder/encodeframe.c create mode 100644 third_party/aom/av1/encoder/encodeframe.h create mode 100644 third_party/aom/av1/encoder/encodemb.c create mode 100644 third_party/aom/av1/encoder/encodemb.h create mode 100644 third_party/aom/av1/encoder/encodemv.c create mode 100644 third_party/aom/av1/encoder/encodemv.h create mode 100644 third_party/aom/av1/encoder/encoder.c create mode 100644 third_party/aom/av1/encoder/encoder.h create mode 100644 third_party/aom/av1/encoder/encodetxb.c create mode 100644 third_party/aom/av1/encoder/encodetxb.h create mode 100644 third_party/aom/av1/encoder/ethread.c create mode 100644 third_party/aom/av1/encoder/ethread.h create mode 100644 third_party/aom/av1/encoder/extend.c create mode 100644 third_party/aom/av1/encoder/extend.h create mode 100644 third_party/aom/av1/encoder/firstpass.c create mode 100644 third_party/aom/av1/encoder/firstpass.h create mode 100644 third_party/aom/av1/encoder/generic_encoder.c create mode 100644 third_party/aom/av1/encoder/global_motion.c create mode 100644 third_party/aom/av1/encoder/global_motion.h create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.c create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.h create mode 100644 third_party/aom/av1/encoder/laplace_encoder.c create mode 100644 third_party/aom/av1/encoder/lookahead.c create mode 100644 third_party/aom/av1/encoder/lookahead.h create mode 100644 third_party/aom/av1/encoder/mbgraph.c create mode 100644 third_party/aom/av1/encoder/mbgraph.h create mode 100644 third_party/aom/av1/encoder/mcomp.c create mode 100644 third_party/aom/av1/encoder/mcomp.h create mode 100644 third_party/aom/av1/encoder/mips/msa/error_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c create mode 100644 third_party/aom/av1/encoder/mips/msa/fdct_msa.h create mode 100644 third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c create mode 100644 third_party/aom/av1/encoder/palette.c create mode 100644 third_party/aom/av1/encoder/palette.h create mode 100644 third_party/aom/av1/encoder/pickcdef.c create mode 100644 third_party/aom/av1/encoder/picklpf.c create mode 100644 third_party/aom/av1/encoder/picklpf.h create mode 100644 third_party/aom/av1/encoder/pickrst.c create mode 100644 third_party/aom/av1/encoder/pickrst.h create mode 100644 third_party/aom/av1/encoder/pvq_encoder.c create mode 100644 third_party/aom/av1/encoder/pvq_encoder.h create mode 100644 third_party/aom/av1/encoder/ransac.c create mode 100644 third_party/aom/av1/encoder/ransac.h create mode 100644 third_party/aom/av1/encoder/ratectrl.c create mode 100644 third_party/aom/av1/encoder/ratectrl.h create mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.c create mode 100644 third_party/aom/av1/encoder/ratectrl_xiph.h create mode 100644 third_party/aom/av1/encoder/rd.c create mode 100644 third_party/aom/av1/encoder/rd.h create mode 100644 third_party/aom/av1/encoder/rdopt.c create mode 100644 third_party/aom/av1/encoder/rdopt.h create mode 100644 third_party/aom/av1/encoder/segmentation.c create mode 100644 third_party/aom/av1/encoder/segmentation.h create mode 100644 third_party/aom/av1/encoder/speed_features.c create mode 100644 third_party/aom/av1/encoder/speed_features.h create mode 100644 third_party/aom/av1/encoder/subexp.c create mode 100644 third_party/aom/av1/encoder/subexp.h create mode 100644 third_party/aom/av1/encoder/temporal_filter.c create mode 100644 third_party/aom/av1/encoder/temporal_filter.h create mode 100644 third_party/aom/av1/encoder/tokenize.c create mode 100644 third_party/aom/av1/encoder/tokenize.h create mode 100644 third_party/aom/av1/encoder/treewriter.c create mode 100644 third_party/aom/av1/encoder/treewriter.h create mode 100644 third_party/aom/av1/encoder/variance_tree.c create mode 100644 third_party/aom/av1/encoder/variance_tree.h create mode 100644 third_party/aom/av1/encoder/wedge_utils.c create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/dct_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/dct_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/dct_ssse3.c create mode 100644 third_party/aom/av1/encoder/x86/error_intrin_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/error_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_sse2.c (limited to 'third_party/aom/av1/encoder') diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c new file mode 100644 index 000000000..054b0e062 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/encodeframe.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 1.75, 1.25, 1.05, 1.00, 0.90 }, + { 2.00, 1.50, 1.15, 1.00, 0.85 }, + { 2.50, 1.75, 1.25, 1.00, 0.80 } +}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 0.15, 0.30, 0.55, 2.00, 100.0 }, + { 0.20, 0.40, 0.65, 2.00, 100.0 }, + { 0.25, 0.50, 0.75, 2.00, 100.0 } +}; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { -4.0, -3.0, -2.0, 100.00, 100.0 }, + { -3.5, -2.5, -1.5, 100.00, 100.0 }, + { -3.0, -2.0, -1.0, 100.00, 100.0 } +}; + +#define DEFAULT_COMPLEXITY 64 + +static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { + // Approximate base quatizer (truncated to int) + const int base_quant = av1_ac_quant(q_index, 0, bit_depth) / 4; + return (base_quant > 10) + (base_quant > 25); +} + +void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + + // Make SURE use of floating point in this function is safe. + aom_clear_system_state(); + + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int segment; + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + + // Clear down the segment map. + memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols); + + av1_clearall_segfeatures(seg); + + // Segmentation only makes sense if the target bits per SB is above a + // threshold. Below this the overheads will usually outweigh any benefit. + if (cpi->rc.sb64_target_rate < 256) { + av1_disable_segmentation(seg); + return; + } + + av1_enable_segmentation(seg); + + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); + + // Use some of the segments for in frame Q adjustment. + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) continue; + + qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, cm->base_qindex, + aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); + + // For AQ complexity mode, we dont allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + if ((cm->base_qindex + qindex_delta) > 0) { + av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } + } + } +} + +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +#define VAR_STRENGTH_STEP 0.25 +// Select a segment for the current block. +// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average and its spatial complexity. +void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { + const AV1_COMMON *const cm = &cpi->common; + + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]); + const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]); + int x, y; + int i; + unsigned char segment; + + if (0) { + segment = DEFAULT_AQ2_SEG; + } else { + // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). + // It is converted to bits * 256 units. + const int64_t num = (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256; + const int denom = cm->mib_size * cm->mib_size; + const int target_rate = (int)(num / denom); + double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); + + aom_clear_system_state(); + low_var_thresh = (cpi->oxcf.pass == 2) ? AOMMAX(cpi->twopass.mb_av_energy, + MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; + + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col); + logvar = av1_log_block_var(cpi, mb, bs); + + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold. + // Increasing segment number (higher variance and complexity) = higher Q. + if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; + break; + } + } + } + + // Fill in the entires in the segment map corresponding to this SB64. + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment; + } + } +} diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h new file mode 100644 index 000000000..af525b36d --- /dev/null +++ b/third_party/aom/av1/encoder/aq_complexity.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AV1_ENCODER_AQ_COMPLEXITY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/common/enums.h" + +struct AV1_COMP; +struct macroblock; + +// Select a segment for the current Block. +void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, int mi_row, int mi_col, + int projected_rate); + +// This function sets up a set of segments with delta Q values around +// the baseline frame quantizer. +void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c new file mode 100644 index 000000000..e41c608b6 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/common/seg_common.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +struct CYCLIC_REFRESH { + // Percentage of blocks per frame that are targeted as candidates + // for cyclic refresh. + int percent_refresh; + // Maximum q-delta as percentage of base q. + int max_qdelta_perc; + // Superblock starting index for cycling through the frame. + int sb_index; + // Controls how long block will need to wait to be refreshed again, in + // excess of the cycle time, i.e., in the case of all zero motion, block + // will be refreshed every (100/percent_refresh + time_for_refresh) frames. + int time_for_refresh; + // Target number of (8x8) blocks that are set for delta-q. + int target_num_seg_blocks; + // Actual number of (8x8) blocks that were applied delta-q. + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + // RD mult. parameters for segment 1. + int rdmult; + // Cyclic refresh map. + signed char *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; + // Thresholds applied to the projected rate/distortion of the coding block, + // when deciding whether block should be refreshed. + int64_t thresh_rate_sb; + int64_t thresh_dist_sb; + // Threshold applied to the motion vector (in units of 1/8 pel) of the + // coding block, when deciding whether block should be refreshed. + int16_t motion_thresh; + // Rate target ratio to set q delta. + double rate_ratio_qdelta; + // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + int rate_boost_fac; + double low_content_avg; + int qindex_delta[3]; +}; + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; + CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr)); + if (cr == NULL) return NULL; + + cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map)); + if (cr->map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = aom_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + + return cr; +} + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { + aom_free(cr->map); + aom_free(cr->last_coded_q_map); + aom_free(cr); +} + +// Check if we should turn off cyclic refresh based on bitrate condition. +static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm, + const RATE_CONTROL *rc) { + // Turn off cyclic refresh if bits available per frame is not sufficiently + // larger than bit cost of segmentation. Segment map bit cost should scale + // with number of seg blocks, so compare available bits to number of blocks. + // Average bits available per frame = avg_frame_bandwidth + // Number of (8x8) blocks in frame = mi_rows * mi_cols; + const float factor = 0.25; + const int number_blocks = cm->mi_rows * cm->mi_cols; + // The condition below corresponds to turning off at target bitrates: + // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kps for HD/720p. + // Also turn off at very small frame sizes, to avoid too large fraction of + // superblocks to be refreshed per frame. Threshold below is less than QCIF. + if (rc->avg_frame_bandwidth < factor * number_blocks || + number_blocks / 64 < 5) + return 0; + else + return 1; +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, + const MB_MODE_INFO *mbmi, int64_t rate, + int64_t dist, int bsize) { + MV mv = mbmi->mv[0].as_mv; + // Reject the block for lower-qp coding if projected distortion + // is above the threshold, and any of the following is true: + // 1) mode uses large mv + // 2) mode is an intra-mode + // Otherwise accept for refresh. + if (dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mbmi))) + return CR_SEGMENT_ID_BASE; + else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && + is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BOOST1; +} + +// Compute delta-q for the segment. +static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const RATE_CONTROL *const rc = &cpi->rc; + int deltaq = av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, + rate_factor, cpi->common.bit_depth); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +// For the just encoded frame, estimate the bits, incorporating the delta-q +// from non-base segment. For now ignore effect of multiple segments +// (with different delta-q). Note this function is called in the postencode +// (called from rc_update_rate_correction_factors()). +int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int estimated_bits; + int mbs = cm->MBs; + int num8x8bl = mbs << 2; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame. Note number of blocks here is in 8x8 units. + double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl; + double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl; + // Take segment weighted average for estimated bits. + estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, + correction_factor, cm->bit_depth) + + weight_segment1 * + av1_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[1], + mbs, correction_factor, cm->bit_depth) + + weight_segment2 * + av1_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[2], + mbs, correction_factor, cm->bit_depth)); + return estimated_bits; +} + +// Prior to encoding the frame, estimate the bits per mb, for a given q = i and +// a corresponding delta-q (for segment 1). This function is called in the +// rc_regulate_q() to set the base qp index. +// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or +// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding. +int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int num8x8bl = cm->MBs << 2; + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + double weight_segment = + (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num8x8bl; + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + // Take segment weighted average for bits per mb. + bits_per_mb = (int)((1.0 - weight_segment) * + av1_rc_bits_per_mb(cm->frame_type, i, + correction_factor, cm->bit_depth) + + weight_segment * + av1_rc_bits_per_mb(cm->frame_type, i + deltaq, + correction_factor, cm->bit_depth)); + return bits_per_mb; +} + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, + MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + const int refresh_this_block = + candidate_refresh_aq(cr, mbmi, rate, dist, bsize); + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + int x = 0; + int y = 0; + + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + mbmi->segment_id = refresh_this_block; + // Reset segment_id if will be skipped. + if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE; + } + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean. The magnitude of the -ve influences how long before we consider + // it for refresh again. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as block that is not candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->segmentation_map[map_offset] = mbmi->segment_id; + // Inter skip blocks were clearly not coded at the current qindex, so + // don't update the map for them. For cases where motion is non-zero or + // the reference frame isn't the previous frame, the previous value in + // the map for this spatial location is not entirely correct. + if ((!is_inter_block(mbmi) || !skip) && + mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) { + cr->last_coded_q_map[map_offset] = clamp( + cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ); + } else if (is_inter_block(mbmi) && skip && + mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) { + cr->last_coded_q_map[map_offset] = + AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id], + 0, MAXQ), + cr->last_coded_q_map[map_offset]); + } + } +} + +// Update the actual number of blocks that were applied the segment delta q. +void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->segmentation_map; + int mi_row, mi_col; + cr->actual_num_seg1_blocks = 0; + cr->actual_num_seg2_blocks = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) == + CR_SEGMENT_ID_BOOST1) + cr->actual_num_seg1_blocks++; + else if (cyclic_refresh_segment_id( + seg_map[mi_row * cm->mi_cols + mi_col]) == + CR_SEGMENT_ID_BOOST2) + cr->actual_num_seg2_blocks++; + } +} + +// Set golden frame update interval, for 1 pass CBR mode. +void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple (== 2) of refresh + // period. Depending on past encoding stats, GF flag may be reset and update + // may not occur until next baseline_gf_interval. + if (cr->percent_refresh > 0) + rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh); + else + rc->baseline_gf_interval = 40; +} + +// Update some encoding stats (from the just encoded frame). If this frame's +// background has high motion, refresh the golden frame. Otherwise, if the +// golden reference is to be updated check if we should NOT update the golden +// ref. +void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int mi_row, mi_col; + double fraction_low = 0.0; + int low_content_frame = 0; + + MODE_INFO **mi; + RATE_CONTROL *const rc = &cpi->rc; + const int rows = cm->mi_rows, cols = cm->mi_cols; + int cnt1 = 0, cnt2 = 0; + int force_gf_refresh = 0; + + for (mi_row = 0; mi_row < rows; mi_row++) { + mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + + for (mi_col = 0; mi_col < cols; mi_col++) { + int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 + ? mi[0]->mbmi.mv[0].as_mv.row + : -1 * mi[0]->mbmi.mv[0].as_mv.row; + int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 + ? mi[0]->mbmi.mv[0].as_mv.col + : -1 * mi[0]->mbmi.mv[0].as_mv.col; + + // Calculate the motion of the background. + if (abs_mvr <= 16 && abs_mvc <= 16) { + cnt1++; + if (abs_mvr == 0 && abs_mvc == 0) cnt2++; + } + mi++; + + // Accumulate low_content_frame. + if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++; + } + } + + // For video conference clips, if the background has high motion in current + // frame because of the camera movement, set this frame as the golden frame. + // Use 70% and 5% as the thresholds for golden frame refreshing. + // Also, force this frame as a golden update frame if this frame will change + // the resolution (resize_pending != 0). + if (cpi->resize_pending != 0 || + (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { + av1_cyclic_refresh_set_golden_update(cpi); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + force_gf_refresh = 1; + } + + fraction_low = (double)low_content_frame / (rows * cols); + // Update average. + cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; + if (!force_gf_refresh && cpi->refresh_golden_frame == 1) { + // Don't update golden reference if the amount of low_content for the + // current encoded frame is small, or if the recursive average of the + // low_content over the update interval window falls below threshold. + if (fraction_low < 0.8 || cr->low_content_avg < 0.7) + cpi->refresh_golden_frame = 0; + // Reset for next internal. + cr->low_content_avg = fraction_low; + } +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock). +static void cyclic_refresh_update_map(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->segmentation_map; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y; + memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); + sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size; + sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size; + sbs_in_frame = sb_cols * sb_rows; + // Number of target blocks to get the q delta (segment 1). + block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + // Set the segmentation map: cycle through the superblocks, starting at + // cr->mb_index, and stopping when either block_count blocks have been found + // to be refreshed, or we have passed through whole frame. + assert(cr->sb_index < sbs_in_frame); + i = cr->sb_index; + cr->target_num_seg_blocks = 0; + do { + int sum_map = 0; + // Get the mi_row/mi_col corresponding to superblock index i. + int sb_row_index = (i / sb_cols); + int sb_col_index = i - sb_row_index * sb_cols; + int mi_row = sb_row_index * cm->mib_size; + int mi_col = sb_col_index * cm->mib_size; + int qindex_thresh = + cpi->oxcf.content == AOM_CONTENT_SCREEN + ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) + : 0; + assert(mi_row >= 0 && mi_row < cm->mi_rows); + assert(mi_col >= 0 && mi_col < cm->mi_cols); + bl_index = mi_row * cm->mi_cols + mi_col; + // Loop through all MI blocks in superblock and update map. + xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size); + ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size); + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + const int bl_index2 = bl_index + y * cm->mi_cols + x; + // If the block is as a candidate for clean up then mark it + // for possible boost/refresh (segment 1). The segment id may get + // reset to 0 later if block gets coded anything other than ZEROMV. + if (cr->map[bl_index2] == 0) { + if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++; + } else if (cr->map[bl_index2] < 0) { + cr->map[bl_index2]++; + } + } + } + // Enforce constant segment over superblock. + // If segment is at least half of superblock, set to 1. + if (sum_map >= xmis * ymis / 2) { + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } + i++; + if (i == sbs_in_frame) { + i = 0; + } + } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); + cr->sb_index = i; +} + +// Set cyclic refresh parameters. +void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + cr->percent_refresh = 10; + cr->max_qdelta_perc = 50; + cr->time_for_refresh = 0; + // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) + // periods of the refresh cycle, after a key frame. + if (rc->frames_since_key < 4 * cr->percent_refresh) + cr->rate_ratio_qdelta = 3.0; + else + cr->rate_ratio_qdelta = 2.0; + // Adjust some parameters for low resolutions at low bitrates. + if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { + cr->motion_thresh = 4; + cr->rate_boost_fac = 10; + } else { + cr->motion_thresh = 32; + cr->rate_boost_fac = 17; + } +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); + if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; + // Don't apply refresh on key frame or enhancement layer frames. + if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) { + // Set segmentation map to 0 and disable. + unsigned char *const seg_map = cpi->segmentation_map; + memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + av1_disable_segmentation(&cm->seg); + if (cm->frame_type == KEY_FRAME) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + } + return; + } else { + int qindex_delta = 0; + int qindex2; + const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + aom_clear_system_state(); + // Set rate threshold to some multiple (set to 2 for now) of the target + // rate (target is given by sb64_target_rate and scaled by 256). + cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + // q will not exceed 457, so (q * q) is within 32bit; see: + // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[]. + cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; + + // Set up segmentation. + // Clear down the segment map. + av1_enable_segmentation(&cm->seg); + av1_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in + // av1_choose_segmap_coding_method(), + // based on the coding cost of each method. For error_resilient mode on the + // last_frame_seg_map is set to 0, so if temporal coding is used, it is + // relative to 0 previous map. + // seg->temporal_update = 0; + + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); + + // Set the q delta for segment BOOST1. + qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta); + cr->qindex_delta[1] = qindex_delta; + + // Compute rd-mult for segment BOOST1. + qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ); + + cr->rdmult = av1_compute_rd_mult(cpi, qindex2); + + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq( + cpi, cm->base_qindex, + AOMMIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); + + // Update the segmentation and refresh map. + cyclic_refresh_update_map(cpi); + } +} + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { + return cr->rdmult; +} + +void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_rows * cm->mi_cols); + cr->sb_index = 0; + cpi->refresh_golden_frame = 1; +} diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h new file mode 100644 index 000000000..459ab80b8 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AV1_ENCODER_AQ_CYCLICREFRESH_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +struct AV1_COMP; + +struct CYCLIC_REFRESH; +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +// Estimate the bits, incorporating the delta-q from segment 1, after encoding +// the frame. +int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, + double correction_factor); + +// Estimate the bits per mb, for a given q = i and a corresponding delta-q +// (for segment 1), prior to encoding the frame. +int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, + double correction_factor); + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi, + MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip); + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +void av1_cyclic_refresh_update__map(struct AV1_COMP *const cpi); + +// Update the actual number of blocks that were applied the segment delta q. +void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi); + +// Set golden frame update interval, for 1 pass CBR mode. +void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi); + +// Check if we should not update golden reference, based on past refresh stats. +void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi); + +// Set/update global/frame level refresh parameters. +void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi); + +// Setup cyclic background refresh: set delta q and segmentation map. +void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi); + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); + +void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi); + +static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { + return segment_id == CR_SEGMENT_ID_BOOST1 || + segment_id == CR_SEGMENT_ID_BOOST2; +} + +static INLINE int cyclic_refresh_segment_id(int segment_id) { + if (segment_id == CR_SEGMENT_ID_BOOST1) + return CR_SEGMENT_ID_BOOST1; + else if (segment_id == CR_SEGMENT_ID_BOOST2) + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BASE; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_AQ_CYCLICREFRESH_H_ diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c new file mode 100644 index 000000000..ab9b3790b --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_ports/mem.h" + +#include "av1/encoder/aq_variance.h" + +#include "av1/common/seg_common.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/segmentation.h" +#include "aom_ports/system_state.h" + +#define ENERGY_MIN (-4) +#define ENERGY_MAX (1) +#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) +#define ENERGY_IN_BOUNDS(energy) \ + assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) + +static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, + 0.75, 1.0, 1.0, 1.0 }; +static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; + +#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] + +DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; +#if CONFIG_HIGHBITDEPTH +DECLARE_ALIGNED(16, static const uint16_t, + av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; +#endif + +unsigned int av1_vaq_segment_id(int energy) { + ENERGY_IN_BOUNDS(energy); + return SEGMENT_ID(energy); +} + +void av1_vaq_frame_setup(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + struct segmentation *seg = &cm->seg; + int i; + + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + cpi->vaq_refresh = 1; + + av1_enable_segmentation(seg); + av1_clearall_segfeatures(seg); + + seg->abs_delta = SEGMENT_DELTADATA; + + aom_clear_system_state(); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + int qindex_delta = + av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, + rate_ratio[i], cm->bit_depth); + + // We don't allow qindex 0 in a segment if the base value is not 0. + // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) { + qindex_delta = -cm->base_qindex + 1; + } + + // No need to enable SEG_LVL_ALT_Q for this segment. + if (rate_ratio[i] == 1.0) { + continue; + } + + av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); + av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + } +} + +/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions + * of variance() and highbd_8_variance(). It should not. + */ +static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_HIGHBITDEPTH +static void aq_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_HIGHBITDEPTH + +static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + unsigned int var, sse; + int right_overflow = + (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; + int bottom_overflow = + (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; + + if (right_overflow || bottom_overflow) { + const int bw = 8 * mi_size_wide[bs] - right_overflow; + const int bh = 8 * mi_size_high[bs] - bottom_overflow; + int avg; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh, + &sse, &avg); + sse >>= 2 * (xd->bd - 8); + avg >>= (xd->bd - 8); + } else { + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, + bw, bh, &sse, &avg); + } +#else + aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0, + bw, bh, &sse, &avg); +#endif // CONFIG_HIGHBITDEPTH + var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh)); + return (unsigned int)((uint64_t)var * 256) / (bw * bh); + } else { +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse); + } else { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + av1_all_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, + av1_all_zeros, 0, &sse); +#endif // CONFIG_HIGHBITDEPTH + return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; + } +} + +double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int var = block_variance(cpi, x, bs); + aom_clear_system_state(); + return log(var + 1.0); +} + +#define DEFAULT_E_MIDPOINT 10.0 +int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + double energy; + double energy_midpoint; + aom_clear_system_state(); + energy_midpoint = + (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT; + energy = av1_log_block_var(cpi, x, bs) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h new file mode 100644 index 000000000..05725c5de --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AQ_VARIANCE_H_ +#define AV1_ENCODER_AQ_VARIANCE_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int av1_vaq_segment_id(int energy); +void av1_vaq_frame_setup(AV1_COMP *cpi); + +int av1_block_energy(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +double av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_AQ_VARIANCE_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/dct_neon.c b/third_party/aom/av1/encoder/arm/neon/dct_neon.c new file mode 100644 index 000000000..f6ce24a3d --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/dct_neon.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./av1_rtcd.h" +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "av1/common/blockd.h" +#include "aom_dsp/txfm_common.h" + +void av1_fdct8x8_quant_neon(const int16_t *input, int stride, + int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16_t temp_buffer[64]; + (void)coeff_ptr; + + aom_fdct8x8_neon(input, temp_buffer, stride); + av1_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_ptr, iscan_ptr); +} diff --git a/third_party/aom/av1/encoder/arm/neon/error_neon.c b/third_party/aom/av1/encoder/arm/neon/error_neon.c new file mode 100644 index 000000000..fe5233f89 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/error_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./av1_rtcd.h" + +int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + int64x2_t error = vdupq_n_s64(0); + + assert(block_size >= 8); + assert((block_size % 8) == 0); + + do { + const int16x8_t c = vld1q_s16(coeff); + const int16x8_t d = vld1q_s16(dqcoeff); + const int16x8_t diff = vsubq_s16(c, d); + const int16x4_t diff_lo = vget_low_s16(diff); + const int16x4_t diff_hi = vget_high_s16(diff); + // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before + // accumulating them in 64-bits. + const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); + const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); + const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); + error = vaddq_s64(error, err2); + coeff += 8; + dqcoeff += 8; + block_size -= 8; + } while (block_size != 0); + + return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); +} diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c new file mode 100644 index 000000000..36e7d3370 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include + +#include "aom_mem/aom_mem.h" + +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(&qcoeff_ptr[0], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = vmax_s16( + vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c new file mode 100644 index 000000000..6cffac264 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -0,0 +1,1790 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +#if CONFIG_NEW_QUANT +static INLINE int quantize_coeff_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, + const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < cuml_bins_ptr[i]) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; + q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16); + } + if (q) { + *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int quantize_coeff_bigtx_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, + const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int logsizeby16) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); + q = NUQ_KNOTS + + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - logsizeby16)); + } + if (q) { + *dqcoeff_ptr = ROUND_POWER_OF_TWO( + av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); + // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> + // (logsizeby16); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int quantize_coeff_fp_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < cuml_bins_ptr[i]) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + q = NUQ_KNOTS + + ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); + } + if (q) { + *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int quantize_coeff_bigtx_fp_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + q = NUQ_KNOTS + + ((((int64_t)tmp - + ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * + quant) >> + (16 - logsizeby16)); + } + if (q) { + *dqcoeff_ptr = ROUND_POWER_OF_TWO( + av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); + // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >> + // (logsizeby16); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr)) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr, + dequant_val, qcoeff_ptr, dqcoeff_ptr)) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void quantize_dc_64x64_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void quantize_dc_64x64_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) + eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0], + quant_shift_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], + &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc])) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], + av1_get_tx_scale(TX_32X32))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], + av1_get_tx_scale(TX_64X64))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) + eob = i; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_NEW_QUANT + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + *eob_ptr = 0; +} + +static void quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, +#endif + int log_scale) { + int i, eob = -1; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + const qm_val_t iwt = iqm_ptr[rc]; + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; +#endif + const int coeff_sign = (coeff >> 31); + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; +#if CONFIG_AOM_QM + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { +#else + if (abs_coeff >= (dequant_ptr[rc != 0] >> (1 + log_scale))) { +#endif + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); +#if CONFIG_AOM_QM + tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> + ((16 - log_scale) + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); +#else + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = + qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale); +#endif + } + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 0); +} + +void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 1); +} + +#if CONFIG_TX64X64 +void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 2); +} +#endif // CONFIG_TX64X64 + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + switch (qparam->log_scale) { + case 0: + if (n_coeffs < 16) { + // TODO(jingning): Need SIMD implementation for smaller block size + // quantization. + quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round_fp, p->quant_fp, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + qparam->log_scale); + } else { + av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + } + break; + case 1: + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round_fp, p->quant_fp, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#if CONFIG_TX64X64 + case 2: + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round_fp, p->quant_fp, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#if CONFIG_TX64X64 + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + (void)sc; + + switch (qparam->log_scale) { + case 0: + aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, + p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], + eob_ptr +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; + case 1: + aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0], + qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#if CONFIG_TX64X64 + aom_quantize_dc_64x64(coeff_ptr, skip_block, p->round, p->quant_fp[0], + qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + case 2: break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +#if CONFIG_NEW_QUANT +void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const uint8_t *band = get_band_translate(qparam->tx_size); + int dq = qparam->dq; + + switch (qparam->log_scale) { + case 0: + quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, + pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); + break; + case 1: + quantize_32x32_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, + p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#if CONFIG_TX64X64 + case 2: + quantize_64x64_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, + p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const uint8_t *band = get_band_translate(qparam->tx_size); + int dq = qparam->dq; + + switch (qparam->log_scale) { + case 0: + quantize_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); + break; + case 1: + quantize_32x32_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#if CONFIG_TX64X64 + case 2: + quantize_64x64_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + int dq = qparam->dq; + (void)sc; + + switch (qparam->log_scale) { + case 0: + quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], + pd->dequant[0], p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], qcoeff_ptr, dqcoeff_ptr, + eob_ptr); + break; + case 1: + quantize_dc_32x32_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], + pd->dequant[0], p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], qcoeff_ptr, + dqcoeff_ptr, eob_ptr); + break; +#if CONFIG_TX64X64 + case 2: + quantize_dc_64x64_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], + pd->dequant[0], p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], qcoeff_ptr, + dqcoeff_ptr, eob_ptr); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} +#endif // CONFIG_NEW_QUANT + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + if (n_coeffs < 16) { + // TODO(jingning): Need SIMD implementation for smaller block size + // quantization. + av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round_fp, p->quant_fp, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + qparam->log_scale); + return; + } + + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + qparam->log_scale); +} + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, + pd->dequant, eob_ptr, sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; + case 1: + aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round, p->quant, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#if CONFIG_TX64X64 + case 2: + aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round, p->quant, p->quant_shift, + qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, + sc->scan, sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +#if CONFIG_HIGHBITDEPTH +static INLINE void highbd_quantize_dc( + const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, +#endif + const int log_scale) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); +#if CONFIG_AOM_QM + (void)qm_ptr; + (void)iqm_ptr; +#endif + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[0]; + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale); + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_HIGHBITDEPTH + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#endif // CONFIG_AOM_QM + + (void)sc; + + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, + p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], + eob_ptr, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + qparam->log_scale); +} + +#if CONFIG_NEW_QUANT +static INLINE int highbd_quantize_coeff_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, + const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < cuml_bins_ptr[i]) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + tmp -= cuml_bins_ptr[NUQ_KNOTS - 1]; + q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16); + } + if (q) { + *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int highbd_quantize_coeff_fp_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < cuml_bins_ptr[i]) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16); + } + if (q) { + *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int highbd_quantize_coeff_bigtx_fp_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby16) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + q = NUQ_KNOTS + + (int)(((tmp - + ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16)) * + quant) >> + (16 - logsizeby16)); + } + if (q) { + *dqcoeff_ptr = ROUND_POWER_OF_TWO( + av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +static INLINE int highbd_quantize_coeff_bigtx_nuq( + const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift, + const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int logsizeby16) { + const int coeff = coeffv; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int i, q; + int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX); + for (i = 0; i < NUQ_KNOTS; i++) { + if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], logsizeby16)) { + q = i; + break; + } + } + if (i == NUQ_KNOTS) { + tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], logsizeby16); + q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> + (16 - logsizeby16)); + } + if (q) { + *dqcoeff_ptr = ROUND_POWER_OF_TWO( + av1_dequant_abscoeff_nuq(q, dequant, dequant_val), logsizeby16); + *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign; + *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr; + } else { + *qcoeff_ptr = 0; + *dqcoeff_ptr = 0; + } + return (q != 0); +} + +void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr)) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t quant, + const int16_t dequant, + const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, + cuml_bins_ptr, dequant_val, qcoeff_ptr, + dqcoeff_ptr)) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc])) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const int16_t *scan, const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], + av1_get_tx_scale(TX_32X32))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, int skip_block, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const int16_t *scan, const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc], av1_get_tx_scale(TX_32X32))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void highbd_quantize_64x64_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const int16_t *scan, const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0], + dequant_ptr[rc != 0], cuml_bins_ptr[band[i]], + dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], + av1_get_tx_scale(TX_64X64))) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_64x64_fp_nuq_c(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, int skip_block, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const int16_t *scan, const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc], av1_get_tx_scale(TX_64X64))) + eob = i; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const cuml_bins_type_nuq *cuml_bins_ptr, + const dequant_val_type_nuq *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const uint8_t *band) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + int i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + if (highbd_quantize_coeff_fp_nuq( + coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0], + cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc], + &dqcoeff_ptr[rc])) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_dc_32x32_nuq( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t quant, const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, + dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_dc_32x32_fp_nuq( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, + qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_32X32))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void highbd_quantize_dc_64x64_nuq( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t quant, const int16_t quant_shift, const int16_t dequant, + const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_bigtx_nuq( + coeff_ptr[rc], quant, quant_shift, dequant, cuml_bins_ptr, + dequant_val, qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void highbd_quantize_dc_64x64_fp_nuq( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr, + const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + int eob = -1; + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + if (!skip_block) { + const int rc = 0; + if (highbd_quantize_coeff_bigtx_fp_nuq( + coeff_ptr[rc], quant, dequant, cuml_bins_ptr, dequant_val, + qcoeff_ptr, dqcoeff_ptr, av1_get_tx_scale(TX_64X64))) + eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void av1_highbd_quantize_b_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const uint8_t *band = get_band_translate(qparam->tx_size); + const int dq = qparam->dq; + + switch (qparam->log_scale) { + case 0: + highbd_quantize_nuq(coeff_ptr, n_coeffs, skip_block, p->quant, + p->quant_shift, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], + qcoeff_ptr, dqcoeff_ptr, eob_ptr, sc->scan, band); + break; + case 1: + highbd_quantize_32x32_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, + pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#if CONFIG_TX64X64 + case 2: + highbd_quantize_64x64_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant, p->quant_shift, + pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_highbd_quantize_fp_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const uint8_t *band = get_band_translate(qparam->tx_size); + const int dq = qparam->dq; + + switch (qparam->log_scale) { + case 0: + highbd_quantize_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; + case 1: + highbd_quantize_32x32_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#if CONFIG_TX64X64 + case 2: + highbd_quantize_64x64_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp, pd->dequant, + (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq], + (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff_ptr, + dqcoeff_ptr, eob_ptr, sc->scan, band); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} + +void av1_highbd_quantize_dc_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const int dq = qparam->dq; + (void)sc; + + switch (qparam->log_scale) { + case 0: + highbd_quantize_dc_fp_nuq(coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], + pd->dequant[0], p->cuml_bins_nuq[dq][0], + pd->dequant_val_nuq[dq][0], qcoeff_ptr, + dqcoeff_ptr, eob_ptr); + break; + case 1: + highbd_quantize_dc_32x32_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, + dqcoeff_ptr, eob_ptr); + break; +#if CONFIG_TX64X64 + case 2: + highbd_quantize_dc_64x64_fp_nuq( + coeff_ptr, n_coeffs, skip_block, p->quant_fp[0], pd->dequant[0], + p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0], qcoeff_ptr, + dqcoeff_ptr, eob_ptr); + break; +#endif // CONFIG_TX64X64 + default: assert(0); + } +} +#endif // CONFIG_NEW_QUANT +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, +#endif + int log_scale) { + int i; + int eob = -1; + const int scale = 1 << log_scale; + const int shift = 16 - log_scale; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + const qm_val_t iwt = iqm_ptr[rc]; + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; +#endif + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; +#if CONFIG_AOM_QM + const uint32_t abs_qcoeff = + (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale; +#else + const uint32_t abs_qcoeff = + (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale; +#endif + if (abs_qcoeff) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#endif // CONFIG_HIGHBITDEPTH + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + uint32_t t; + int l, m; + t = d; + for (l = 0; t > 1; l++) t >>= 1; + m = 1 + (1 << (16 + l)) / d; + *quant = (int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); +} + +static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { + const int quant = av1_dc_quant(q, 0, bit_depth); +#if CONFIG_HIGHBITDEPTH + switch (bit_depth) { + case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +#else + (void)bit_depth; + return q == 0 ? 64 : (quant < 148 ? 84 : 80); +#endif +} + +void av1_init_quantizer(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + QUANTS *const quants = &cpi->quants; + int i, q, quant; +#if CONFIG_NEW_QUANT + int dq; +#endif + + for (q = 0; q < QINDEX_RANGE; q++) { + const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + const int qrounding_factor = q == 0 ? 64 : 48; + + for (i = 0; i < 2; ++i) { + int qrounding_factor_fp = 64; + // y + quant = i == 0 ? av1_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) + : av1_ac_quant(q, 0, cm->bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); + quants->y_quant_fp[q][i] = (1 << 16) / quant; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->y_round[q][i] = (qrounding_factor * quant) >> 7; + cpi->y_dequant[q][i] = quant; + + // uv + quant = i == 0 ? av1_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth) + : av1_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth); + invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], + quant); + quants->uv_quant_fp[q][i] = (1 << 16) / quant; + quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; + cpi->uv_dequant[q][i] = quant; + } + +#if CONFIG_NEW_QUANT + for (dq = 0; dq < QUANT_PROFILES; dq++) { + for (i = 0; i < COEF_BANDS; i++) { + const int y_quant = cpi->y_dequant[q][i != 0]; + const int uvquant = cpi->uv_dequant[q][i != 0]; + av1_get_dequant_val_nuq(y_quant, i, cpi->y_dequant_val_nuq[dq][q][i], + quants->y_cuml_bins_nuq[dq][q][i], dq); + av1_get_dequant_val_nuq(uvquant, i, cpi->uv_dequant_val_nuq[dq][q][i], + quants->uv_cuml_bins_nuq[dq][q][i], dq); + } + } +#endif // CONFIG_NEW_QUANT + + for (i = 2; i < 8; i++) { // 8: SIMD width + quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; + quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; + quants->y_zbin[q][i] = quants->y_zbin[q][1]; + quants->y_round[q][i] = quants->y_round[q][1]; + cpi->y_dequant[q][i] = cpi->y_dequant[q][1]; + + quants->uv_quant[q][i] = quants->uv_quant[q][1]; + quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; + quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; + quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; + quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; + quants->uv_round[q][i] = quants->uv_round[q][1]; + cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1]; + } + } +} + +void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, + int segment_id) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const QUANTS *const quants = &cpi->quants; + +#if CONFIG_DELTA_Q +#if CONFIG_EXT_DELTA_Q + int current_q_index = AOMMAX( + 0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q + ? cm->base_qindex + xd->delta_qindex + : cm->base_qindex)); +#else + int current_q_index = AOMMAX( + 0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_present_flag + ? cm->base_qindex + xd->delta_qindex + : cm->base_qindex)); +#endif + const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index); +#else + const int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); +#endif + const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + int i; +#if CONFIG_AOM_QM + int minqm = cm->min_qmlevel; + int maxqm = cm->max_qmlevel; + // Quant matrix only depends on the base QP so there is only one set per frame + int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0) + ? NUM_QM_LEVELS - 1 + : aom_get_qmlevel(cm->base_qindex, minqm, maxqm); +#endif +#if CONFIG_NEW_QUANT + int dq; +#endif + + // Y + x->plane[0].quant = quants->y_quant[qindex]; + x->plane[0].quant_fp = quants->y_quant_fp[qindex]; + x->plane[0].round_fp = quants->y_round_fp[qindex]; + x->plane[0].quant_shift = quants->y_quant_shift[qindex]; + x->plane[0].zbin = quants->y_zbin[qindex]; + x->plane[0].round = quants->y_round[qindex]; +#if CONFIG_AOM_QM + memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0], + sizeof(cm->gqmatrix[qmlevel][0])); + memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0], + sizeof(cm->giqmatrix[qmlevel][0])); +#endif + xd->plane[0].dequant = cpi->y_dequant[qindex]; +#if CONFIG_NEW_QUANT + for (dq = 0; dq < QUANT_PROFILES; dq++) { + x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex]; + xd->plane[0].dequant_val_nuq[dq] = cpi->y_dequant_val_nuq[dq][qindex]; + } +#endif // CONFIG_NEW_QUANT + + // UV + for (i = 1; i < 3; i++) { + x->plane[i].quant = quants->uv_quant[qindex]; + x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; + x->plane[i].round_fp = quants->uv_round_fp[qindex]; + x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; + x->plane[i].zbin = quants->uv_zbin[qindex]; + x->plane[i].round = quants->uv_round[qindex]; +#if CONFIG_AOM_QM + memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1], + sizeof(cm->gqmatrix[qmlevel][1])); + memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], + sizeof(cm->giqmatrix[qmlevel][1])); +#endif + xd->plane[i].dequant = cpi->uv_dequant[qindex]; +#if CONFIG_NEW_QUANT + for (dq = 0; dq < QUANT_PROFILES; dq++) { + x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex]; + xd->plane[i].dequant_val_nuq[dq] = cpi->uv_dequant_val_nuq[dq][qindex]; + } +#endif // CONFIG_NEW_QUANT + } + + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->qindex = qindex; + + set_error_per_bit(x, rdmult); + + av1_initialize_me_consts(cpi, x, qindex); +} + +void av1_frame_init_quantizer(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); +} + +void av1_set_quantizer(AV1_COMMON *cm, int q) { + // quantizer has to be reinitialized with av1_init_quantizer() if any + // delta_q changes. + cm->base_qindex = q; + cm->y_dc_delta_q = 0; + cm->uv_dc_delta_q = 0; + cm->uv_ac_delta_q = 0; +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int av1_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int av1_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h new file mode 100644 index 000000000..c87b6b7dc --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_QUANTIZE_H_ +#define AV1_ENCODER_QUANTIZE_H_ + +#include "./aom_config.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct QUANT_PARAM { + int log_scale; +#if CONFIG_NEW_QUANT + TX_SIZE tx_size; + int dq; +#endif // CONFIG_NEW_QUANT +#if CONFIG_AOM_QM + const qm_val_t *qmatrix; + const qm_val_t *iqmatrix; +#endif // CONFIG_AOM_QM +} QUANT_PARAM; + +typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +typedef struct { +#if CONFIG_NEW_QUANT + DECLARE_ALIGNED( + 16, tran_low_t, + y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); + DECLARE_ALIGNED( + 16, tran_low_t, + uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS][NUQ_KNOTS]); +#endif // CONFIG_NEW_QUANT + // 0: dc 1: ac 2-8: ac repeated to SIMD width + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); +} QUANTS; + +struct AV1_COMP; +struct AV1Common; + +void av1_frame_init_quantizer(struct AV1_COMP *cpi); + +void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, + int segment_id); + +void av1_init_quantizer(struct AV1_COMP *cpi); + +void av1_set_quantizer(struct AV1Common *cm, int q); + +int av1_quantizer_to_qindex(int quantizer); + +int av1_qindex_to_quantizer(int qindex); + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr, + uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +#if CONFIG_NEW_QUANT +void av1_quantize_fp_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_quantize_b_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); +#endif // CONFIG_NEW_QUANT + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +#if CONFIG_NEW_QUANT +void av1_highbd_quantize_fp_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_nuq_facade( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); +#endif // CONFIG_NEW_QUANT +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_QUANTIZE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c new file mode 100644 index 000000000..7cc6179ea --- /dev/null +++ b/third_party/aom/av1/encoder/bitstream.c @@ -0,0 +1,5399 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem_ops.h" +#include "aom_ports/system_state.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_CDEF +#include "av1/common/cdef.h" +#include "av1/common/clpf.h" +#endif // CONFIG_CDEF +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/odintrin.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#if CONFIG_EXT_INTRA +#include "av1/common/reconintra.h" +#endif // CONFIG_EXT_INTRA +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#if CONFIG_ANS +#include "aom_dsp/buf_ans.h" +#endif // CONFIG_ANS +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.h" +#endif // CONFIG_LV_MAP +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/mcomp.h" +#if CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING +#include "av1/encoder/palette.h" +#endif // CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING +#include "av1/encoder/segmentation.h" +#include "av1/encoder/subexp.h" +#include "av1/encoder/tokenize.h" +#if CONFIG_PVQ +#include "av1/encoder/pvq_encoder.h" +#endif + +static struct av1_token intra_mode_encodings[INTRA_MODES]; +static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS]; +#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EC_MULTISYMBOL +static const struct av1_token ext_partition_encodings[EXT_PARTITION_TYPES] = { + { 0, 1 }, { 4, 3 }, { 12, 4 }, { 7, 3 }, + { 10, 4 }, { 11, 4 }, { 26, 5 }, { 27, 5 } +}; +#endif +static struct av1_token partition_encodings[PARTITION_TYPES]; +#if !CONFIG_REF_MV +static struct av1_token inter_mode_encodings[INTER_MODES]; +#endif +#if CONFIG_EXT_INTER +static const struct av1_token + inter_compound_mode_encodings[INTER_COMPOUND_MODES] = { + { 2, 2 }, { 50, 6 }, { 51, 6 }, { 24, 5 }, { 52, 6 }, + { 53, 6 }, { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 } + }; +#endif // CONFIG_EXT_INTER +#if CONFIG_PALETTE +static struct av1_token palette_size_encodings[PALETTE_SIZES]; +static struct av1_token palette_color_index_encodings[PALETTE_SIZES] + [PALETTE_COLORS]; +#endif // CONFIG_PALETTE +#if !CONFIG_EC_MULTISYMBOL +static const struct av1_token tx_size_encodings[MAX_TX_DEPTH][TX_SIZES] = { + { { 0, 1 }, { 1, 1 } }, // Max tx_size is 8X8 + { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // Max tx_size is 16X16 + { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // Max tx_size is 32X32 +#if CONFIG_TX64X64 + { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 15, 4 } }, // Max tx_size 64X64 +#endif // CONFIG_TX64X64 +}; +#endif + +#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE +static INLINE void write_uniform(aom_writer *w, int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_literal(w, (v - m) & 1, 1); + } +} +#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE + +#if CONFIG_EXT_TX +static struct av1_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES]; +static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES]; +#else +static struct av1_token ext_tx_encodings[TX_TYPES]; +#endif // CONFIG_EXT_TX +#if CONFIG_GLOBAL_MOTION +static struct av1_token global_motion_types_encodings[GLOBAL_TRANS_TYPES]; +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_EXT_INTRA +#if CONFIG_INTRA_INTERP +static struct av1_token intra_filter_encodings[INTRA_FILTERS]; +#endif // CONFIG_INTRA_INTERP +#endif // CONFIG_EXT_INTRA +#if CONFIG_EXT_INTER +static struct av1_token interintra_mode_encodings[INTERINTRA_MODES]; +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +static struct av1_token compound_type_encodings[COMPOUND_TYPES]; +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +static struct av1_token motion_mode_encodings[MOTION_MODES]; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_LOOP_RESTORATION +static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES]; +#endif // CONFIG_LOOP_RESTORATION +static void write_uncompressed_header(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb); +static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data); +static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes); + +void av1_encode_token_init(void) { +#if CONFIG_EXT_TX || CONFIG_PALETTE + int s; +#endif // CONFIG_EXT_TX || CONFIG_PALETTE +#if CONFIG_EXT_TX + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]); + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + av1_tokens_from_tree(ext_tx_intra_encodings[s], av1_ext_tx_intra_tree[s]); + } +#else + av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree); +#endif // CONFIG_EXT_TX + av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree); + av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree); + av1_tokens_from_tree(partition_encodings, av1_partition_tree); +#if !CONFIG_REF_MV + av1_tokens_from_tree(inter_mode_encodings, av1_inter_mode_tree); +#endif + +#if CONFIG_PALETTE + av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree); + for (s = 0; s < PALETTE_SIZES; ++s) { + av1_tokens_from_tree(palette_color_index_encodings[s], + av1_palette_color_index_tree[s]); + } +#endif // CONFIG_PALETTE + +#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP + av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree); +#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP +#if CONFIG_EXT_INTER + av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree); +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree); +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + av1_tokens_from_tree(motion_mode_encodings, av1_motion_mode_tree); +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_GLOBAL_MOTION + av1_tokens_from_tree(global_motion_types_encodings, + av1_global_motion_types_tree); +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_LOOP_RESTORATION + av1_tokens_from_tree(switchable_restore_encodings, + av1_switchable_restore_tree); +#endif // CONFIG_LOOP_RESTORATION + +#if CONFIG_EC_MULTISYMBOL + /* This hack is necessary when CONFIG_DUAL_FILTER is enabled because the five + SWITCHABLE_FILTERS are not consecutive, e.g., 0, 1, 2, 3, 4, when doing + an in-order traversal of the av1_switchable_interp_tree structure. */ + av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv, + av1_switchable_interp_tree); +/* This hack is necessary because the four TX_TYPES are not consecutive, + e.g., 0, 1, 2, 3, when doing an in-order traversal of the av1_ext_tx_tree + structure. */ +#if CONFIG_EXT_TX + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) + av1_indices_from_tree(av1_ext_tx_intra_ind[s], av1_ext_tx_intra_inv[s], + av1_ext_tx_intra_tree[s]); + for (s = 1; s < EXT_TX_SETS_INTER; ++s) + av1_indices_from_tree(av1_ext_tx_inter_ind[s], av1_ext_tx_inter_inv[s], + av1_ext_tx_inter_tree[s]); +#else + av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree); +#endif + av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv, + av1_intra_mode_tree); + av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv, + av1_inter_mode_tree); +#endif +} + +static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, + const MODE_INFO *mi, const MODE_INFO *above_mi, + const MODE_INFO *left_mi, int block, + PREDICTION_MODE mode, aom_writer *w) { +#if CONFIG_INTRABC + assert(!is_intrabc_block(&mi->mbmi)); +#endif // CONFIG_INTRABC +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_intra_mode_ind[mode], + get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block), + INTRA_MODES); + (void)cm; +#else + av1_write_token(w, av1_intra_mode_tree, + get_y_mode_probs(cm, mi, above_mi, left_mi, block), + &intra_mode_encodings[mode]); + (void)frame_ctx; +#endif +} + +#if CONFIG_EXT_INTER +static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode, + const aom_prob *probs) { + av1_write_token(w, av1_interintra_mode_tree, probs, + &interintra_mode_encodings[mode]); +} +#endif // CONFIG_EXT_INTER + +static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, + FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) { +#if CONFIG_REF_MV + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + const aom_prob newmv_prob = ec_ctx->newmv_prob[newmv_ctx]; + +#define IS_NEWMV_MODE(mode) ((mode) == NEWMV) + aom_write(w, !IS_NEWMV_MODE(mode), newmv_prob); + + if (!IS_NEWMV_MODE(mode)) { + const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; + const aom_prob zeromv_prob = ec_ctx->zeromv_prob[zeromv_ctx]; + + if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { + assert(mode == ZEROMV); + return; + } + + aom_write(w, mode != ZEROMV, zeromv_prob); + + if (mode != ZEROMV) { + int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + aom_prob refmv_prob; + + if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; + if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; + if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; + + refmv_prob = ec_ctx->refmv_prob[refmv_ctx]; + aom_write(w, mode != NEARESTMV, refmv_prob); + } + } + +#undef IS_NEWMV_MODE + +#else // !CONFIG_REF_MV + assert(is_inter_mode(mode)); +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_inter_mode_ind[INTER_OFFSET(mode)], + ec_ctx->inter_mode_cdf[mode_ctx], INTER_MODES); +#else + { + const aom_prob *const inter_probs = ec_ctx->inter_mode_probs[mode_ctx]; + av1_write_token(w, av1_inter_mode_tree, inter_probs, + &inter_mode_encodings[INTER_OFFSET(mode)]); + } +#endif +#endif +} + +#if CONFIG_REF_MV +static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) { + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + + assert(mbmi->ref_mv_idx < 3); + +#if CONFIG_EXT_INTER + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { +#else + if (mbmi->mode == NEWMV) { +#endif + int idx; + for (idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + aom_prob drl_prob = cm->fc->drl_prob[drl_ctx]; + + aom_write(w, mbmi->ref_mv_idx != idx, drl_prob); + if (mbmi->ref_mv_idx == idx) return; + } + } + return; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + int idx; + // TODO(jingning): Temporary solution to compensate the NEARESTMV offset. + for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + aom_prob drl_prob = cm->fc->drl_prob[drl_ctx]; + + aom_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob); + if (mbmi->ref_mv_idx == (idx - 1)) return; + } + } + return; + } +} +#endif + +#if CONFIG_EXT_INTER +static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + const aom_prob *const inter_compound_probs = + cm->fc->inter_compound_mode_probs[mode_ctx]; + + assert(is_inter_compound_mode(mode)); + av1_write_token(w, av1_inter_compound_mode_tree, inter_compound_probs, + &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]); +} +#endif // CONFIG_EXT_INTER + +static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data, + int max) { + aom_wb_write_literal(wb, data, get_unsigned_bits(max)); +} + +#if !CONFIG_EC_ADAPT || \ + (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER) +static void prob_diff_update(const aom_tree_index *tree, + aom_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], int n, + int probwt, aom_writer *w) { + int i; + unsigned int branch_ct[32][2]; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + + av1_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) + av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt); +} +#endif + +#if CONFIG_EXT_INTER || !CONFIG_EC_ADAPT +static int prob_diff_update_savings(const aom_tree_index *tree, + aom_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], int n, + int probwt) { + int i; + unsigned int branch_ct[32][2]; + int savings = 0; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + av1_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) { + savings += + av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i], probwt); + } + return savings; +} +#endif // CONFIG_EXT_INTER || !CONFIG_EC_ADAPT + +#if CONFIG_VAR_TX +static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, TX_SIZE tx_size, + int depth, int blk_row, int blk_col, + aom_writer *w) { + const int tx_row = blk_row >> 1; + const int tx_col = blk_col >> 1; + const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + + int ctx = txfm_partition_context(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, + mbmi->sb_type, tx_size); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (depth == MAX_VARTX_DEPTH) { + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, tx_size, tx_size); + return; + } + + if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) { + aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]); + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, tx_size, tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int i; + + aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]); + + if (tx_size == TX_8X8) { + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, sub_txs, tx_size); + return; + } + + assert(bsl > 0); + for (i = 0; i < 4; ++i) { + int offsetr = blk_row + (i >> 1) * bsl; + int offsetc = blk_col + (i & 0x01) * bsl; + write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, + w); + } + } +} + +static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts, int probwt) { + int k; + for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k) + av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k], + counts->txfm_partition[k], probwt); +} +#endif + +static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif +// For sub8x8 blocks the tx_size symbol does not need to be sent +#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX + if (bsize > BLOCK_4X4) { +#else + if (bsize >= BLOCK_8X8) { +#endif + const TX_SIZE tx_size = mbmi->tx_size; + const int is_inter = is_inter_block(mbmi); + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + const int depth = tx_size_to_depth(coded_tx_size); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); + assert( + IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize])); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + tx_size_cat + 2); +#else + av1_write_token(w, av1_tx_size_tree[tx_size_cat], + ec_ctx->tx_size_probs[tx_size_cat][tx_size_ctx], + &tx_size_encodings[tx_size_cat][depth]); +#endif + } +} + +#if CONFIG_REF_MV +static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { + int i; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) + av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i], + probwt); + for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) + av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i], + counts->zeromv_mode[i], probwt); + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) + av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i], + probwt); + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) + av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i], + probwt); +} +#endif + +#if CONFIG_EXT_INTER +static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt, + aom_writer *w) { + const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - + av1_cost_zero(GROUP_DIFF_UPDATE_PROB); + int i; + int savings = 0; + int do_update = 0; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + savings += prob_diff_update_savings( + av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i], + cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt); + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + prob_diff_update( + av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i], + cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w); + } + } +} +#endif // CONFIG_EXT_INTER + +static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MODE_INFO *mi, aom_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip = mi->mbmi.skip; + aom_write(w, skip, av1_get_skip_prob(cm, xd)); + return skip; + } +} + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +static void write_motion_mode(const AV1_COMMON *cm, const MODE_INFO *mi, + aom_writer *w) { + const MB_MODE_INFO *mbmi = &mi->mbmi; + MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + 0, cm->global_motion, +#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + mi); + + if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return; +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + if (last_motion_mode_allowed == OBMC_CAUSAL) { + aom_write(w, mbmi->motion_mode == OBMC_CAUSAL, + cm->fc->obmc_prob[mbmi->sb_type]); + } else { +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + av1_write_token(w, av1_motion_mode_tree, + cm->fc->motion_mode_prob[mbmi->sb_type], + &motion_mode_encodings[mbmi->motion_mode]); +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + } +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +} +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + +#if CONFIG_DELTA_Q +static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int delta_qindex, aom_writer *w) { + int sign = delta_qindex < 0; + int abs = sign ? -delta_qindex : delta_qindex; + int rem_bits, thr; + int smallval = abs < DELTA_Q_SMALL ? 1 : 0; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; + (void)xd; +#endif + +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, + DELTA_Q_PROBS + 1); +#else + int i = 0; + while (i < DELTA_Q_SMALL && i <= abs) { + int bit = (i < abs); + aom_write(w, bit, ec_ctx->delta_q_prob[i]); + i++; + } +#endif + + if (!smallval) { + rem_bits = OD_ILOG_NZ(abs - 1) - 1; + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +#if !CONFIG_EC_ADAPT +static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { + int k; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif +#if CONFIG_EXT_DELTA_Q + if (!cm->delta_q_present_flag) return; +#endif // CONFIG_EXT_DELTA_Q + for (k = 0; k < DELTA_Q_PROBS; ++k) { + av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k], + probwt); + } +} +#endif // CONFIG_EC_ADAPT + +#if CONFIG_EXT_DELTA_Q +static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int delta_lflevel, aom_writer *w) { + int sign = delta_lflevel < 0; + int abs = sign ? -delta_lflevel : delta_lflevel; + int rem_bits, thr; + int smallval = abs < DELTA_LF_SMALL ? 1 : 0; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; + (void)xd; +#endif + +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, + DELTA_LF_PROBS + 1); +#else + int i = 0; + while (i < DELTA_LF_SMALL && i <= abs) { + int bit = (i < abs); + aom_write(w, bit, ec_ctx->delta_lf_prob[i]); + i++; + } +#endif // CONFIG_EC_MULTISYMBOL + + if (!smallval) { + rem_bits = OD_ILOG_NZ(abs - 1) - 1; + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +#if !CONFIG_EC_ADAPT +static void update_delta_lf_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { + int k; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + if (!cm->delta_lf_present_flag) return; + for (k = 0; k < DELTA_LF_PROBS; ++k) { + av1_cond_prob_diff_update(w, &cm->fc->delta_lf_prob[k], counts->delta_lf[k], + probwt); + } +} +#endif // CONFIG_EC_ADAPT +#endif // CONFIG_EXT_DELTA_Q +#endif // CONFIG_DELTA_Q + +static void update_skip_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { + int k; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + for (k = 0; k < SKIP_CONTEXTS; ++k) { + av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k], + probwt); + } +} + +#if !CONFIG_EC_ADAPT +static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { + int j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + prob_diff_update( + av1_switchable_interp_tree, cm->fc->switchable_interp_prob[j], + counts->switchable_interp[j], SWITCHABLE_FILTERS, probwt, w); + } +} +#endif + +#if !CONFIG_EC_ADAPT +#if CONFIG_EXT_TX +static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) { + const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - + av1_cost_zero(GROUP_DIFF_UPDATE_PROB); + int i, j; + int s; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + int savings = 0; + int do_update = 0; + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + if (!use_inter_ext_tx_for_txsize[s][i]) continue; + savings += prob_diff_update_savings( + av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i], + cm->counts.inter_ext_tx[s][i], + num_ext_tx_set[ext_tx_set_type_inter[s]], probwt); + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + if (!use_inter_ext_tx_for_txsize[s][i]) continue; + prob_diff_update(av1_ext_tx_inter_tree[s], + cm->fc->inter_ext_tx_prob[s][i], + cm->counts.inter_ext_tx[s][i], + num_ext_tx_set[ext_tx_set_type_inter[s]], probwt, w); + } + } + } + + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + int savings = 0; + int do_update = 0; + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + if (!use_intra_ext_tx_for_txsize[s][i]) continue; + for (j = 0; j < INTRA_MODES; ++j) + savings += prob_diff_update_savings( + av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j], + cm->counts.intra_ext_tx[s][i][j], + num_ext_tx_set[ext_tx_set_type_intra[s]], probwt); + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + if (!use_intra_ext_tx_for_txsize[s][i]) continue; + for (j = 0; j < INTRA_MODES; ++j) + prob_diff_update(av1_ext_tx_intra_tree[s], + cm->fc->intra_ext_tx_prob[s][i][j], + cm->counts.intra_ext_tx[s][i][j], + num_ext_tx_set[ext_tx_set_type_intra[s]], probwt, w); + } + } + } +} + +#else +static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) { + const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - + av1_cost_zero(GROUP_DIFF_UPDATE_PROB); + int i, j; + + int savings = 0; + int do_update = 0; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + savings += prob_diff_update_savings( + av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j], + cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt); + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) { + prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j], + cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w); + } + } + } + + savings = 0; + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + savings += + prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i], + cm->counts.inter_ext_tx[i], TX_TYPES, probwt); + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i], + cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w); + } + } +} +#endif // CONFIG_EXT_TX +#endif // !CONFIG_EC_ADAPT +#if CONFIG_PALETTE +static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, + int num) { + int i; + const TOKENEXTRA *p = *tp; + + for (i = 0; i < num; ++i) { + av1_write_token( + w, av1_palette_color_index_tree[n - PALETTE_MIN_SIZE], p->context_tree, + &palette_color_index_encodings[n - PALETTE_MIN_SIZE][p->token]); + ++p; + } + + *tp = p; +} +#endif // CONFIG_PALETTE + +#if !CONFIG_PVQ +#if CONFIG_SUPERTX +static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) { + const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - + av1_cost_zero(GROUP_DIFF_UPDATE_PROB); + int i, j; + int savings = 0; + int do_update = 0; + for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { + for (j = TX_8X8; j < TX_SIZES; ++j) { + savings += av1_cond_prob_diff_update_savings( + &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt); + } + } + do_update = savings > savings_thresh; + aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) { + for (j = TX_8X8; j < TX_SIZES; ++j) { + av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j], + cm->counts.supertx[i][j], probwt); + } + } + } +} +#endif // CONFIG_SUPERTX + +#if CONFIG_NEW_MULTISYMBOL +static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val, + int n, aom_writer *w) { + // Code the extra bits from LSB to MSB in groups of 4 + int i = 0; + int count = 0; + while (count < n) { + const int size = AOMMIN(n - count, 4); + const int mask = (1 << size) - 1; + aom_write_cdf(w, val & mask, cdf[i++], 1 << size); + val >>= size; + count += size; + } +} +#else +static INLINE void write_coeff_extra(const aom_prob *pb, int value, + int num_bits, int skip_bits, aom_writer *w, + TOKEN_STATS *token_stats) { + // Code the extra bits from MSB to LSB 1 bit at a time + int index; + for (index = skip_bits; index < num_bits; ++index) { + const int shift = num_bits - index - 1; + const int bb = (value >> shift) & 1; + aom_write_record(w, bb, pb[index], token_stats); + } +} +#endif + +#if CONFIG_NEW_TOKENSET && !CONFIG_LV_MAP +static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, + const TOKENEXTRA *const stop, + aom_bit_depth_t bit_depth, const TX_SIZE tx_size, + TOKEN_STATS *token_stats) { + const TOKENEXTRA *p = *tp; +#if CONFIG_VAR_TX + int count = 0; + const int seg_eob = tx_size_2d[tx_size]; +#endif + + while (p < stop && p->token != EOSB_TOKEN) { + const int token = p->token; + if (token == BLOCK_Z_TOKEN) { + aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1); + p++; + continue; + } + + const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; + if (p->eob_val == LAST_EOB) { + // Just code a flag indicating whether the value is >1 or 1. + aom_write_bit(w, token != ONE_TOKEN); + } else { + int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - p->eob_val + p->first_val; + aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val); + } + if (token > ONE_TOKEN) { + aom_write_symbol(w, token - TWO_TOKEN, *p->tail_cdf, TAIL_TOKENS); + } + + if (extra_bits->base_val) { + const int bit_string = p->extra; + const int bit_string_length = extra_bits->len; // Length of extra bits to + const int is_cat6 = (extra_bits->base_val == CAT6_MIN_VAL); + // be written excluding + // the sign bit. + int skip_bits = is_cat6 + ? (int)sizeof(av1_cat6_prob) - + av1_get_cat6_extrabits_size(tx_size, bit_depth) + : 0; + + assert(!(bit_string >> (bit_string_length - skip_bits + 1))); + if (bit_string_length > 0) +#if CONFIG_NEW_MULTISYMBOL + write_coeff_extra(extra_bits->cdf, bit_string >> 1, + bit_string_length - skip_bits, w); +#else + write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length, + skip_bits, w, token_stats); +#endif + + aom_write_bit_record(w, bit_string & 1, token_stats); + } + ++p; + +#if CONFIG_VAR_TX + ++count; + if (token == EOB_TOKEN || count == seg_eob) break; +#endif + } + + *tp = p; +} +#else // CONFIG_NEW_TOKENSET +#if !CONFIG_LV_MAP +static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, + const TOKENEXTRA *const stop, + aom_bit_depth_t bit_depth, const TX_SIZE tx_size, + TOKEN_STATS *token_stats) { + const TOKENEXTRA *p = *tp; +#if CONFIG_VAR_TX + int count = 0; + const int seg_eob = tx_size_2d[tx_size]; +#endif + + while (p < stop && p->token != EOSB_TOKEN) { + const int token = p->token; +#if !CONFIG_EC_MULTISYMBOL + const struct av1_token *const coef_encoding = &av1_coef_encodings[token]; + int coef_value = coef_encoding->value; + int coef_length = coef_encoding->len; +#endif // !CONFIG_EC_MULTISYMBOL + const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; + +#if CONFIG_EC_MULTISYMBOL + /* skip one or two nodes */ + if (!p->skip_eob_node) + aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats); + if (token != EOB_TOKEN) { + aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats); + if (token != ZERO_TOKEN) { + aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf, + CATEGORY6_TOKEN - ONE_TOKEN + 1); + } + } +#else + /* skip one or two nodes */ + if (p->skip_eob_node) + coef_length -= p->skip_eob_node; + else + aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats); + + if (token != EOB_TOKEN) { + aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats); + + if (token != ZERO_TOKEN) { + aom_write_record(w, token != ONE_TOKEN, p->context_tree[2], + token_stats); + + if (token != ONE_TOKEN) { + const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node; + aom_write_tree_record( + w, av1_coef_con_tree, + av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value, + coef_length - unconstrained_len, 0, token_stats); + } + } + } +#endif // CONFIG_EC_MULTISYMBOL + + if (extra_bits->base_val) { + const int bit_string = p->extra; + const int bit_string_length = extra_bits->len; // Length of extra bits to + // be written excluding + // the sign bit. + int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL) + ? (int)sizeof(av1_cat6_prob) - + av1_get_cat6_extrabits_size(tx_size, bit_depth) + : 0; + + assert(!(bit_string >> (bit_string_length - skip_bits + 1))); + if (bit_string_length > 0) { +#if CONFIG_NEW_MULTISYMBOL + skip_bits &= ~3; + write_coeff_extra(extra_bits->cdf, bit_string >> 1, + bit_string_length - skip_bits, w); +#else + write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length, + skip_bits, w, token_stats); +#endif + } + aom_write_bit_record(w, bit_string & 1, token_stats); + } + ++p; + +#if CONFIG_VAR_TX + ++count; + if (token == EOB_TOKEN || count == seg_eob) break; +#endif + } + + *tp = p; +} +#endif // !CONFIG_LV_MAP +#endif // CONFIG_NEW_TOKENSET +#else // !CONFIG_PVQ +static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) { + PVQ_INFO *pvq; + + assert(pvq_q->curr_pos <= pvq_q->last_pos); + assert(pvq_q->curr_pos < pvq_q->buf_len); + + pvq = pvq_q->buf + pvq_q->curr_pos; + ++pvq_q->curr_pos; + + return pvq; +} + +static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x, + MACROBLOCKD *const xd, int plane, BLOCK_SIZE bsize, + const TX_SIZE tx_size) { + PVQ_INFO *pvq; + int idx, idy; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + od_adapt_ctx *adapt; + int max_blocks_wide; + int max_blocks_high; + int step = (1 << tx_size); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); + + adapt = x->daala_enc.state.adapt; + + max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + max_blocks_high = max_block_high(xd, plane_bsize, plane); + + for (idy = 0; idy < max_blocks_high; idy += step) { + for (idx = 0; idx < max_blocks_wide; idx += step) { + const int is_keyframe = 0; + const int encode_flip = 0; + const int flip = 0; + int i; + const int has_dc_skip = 1; + int *exg = &adapt->pvq.pvq_exg[plane][tx_size][0]; + int *ext = adapt->pvq.pvq_ext + tx_size * PVQ_MAX_PARTITIONS; + generic_encoder *model = adapt->pvq.pvq_param_model; + + pvq = get_pvq_block(x->pvq_q); + + // encode block skip info + aom_write_symbol(w, pvq->ac_dc_coded, + adapt->skip_cdf[2 * tx_size + (plane != 0)], 4); + + // AC coeffs coded? + if (pvq->ac_dc_coded & AC_CODED) { + assert(pvq->bs == tx_size); + for (i = 0; i < pvq->nb_bands; i++) { + if (i == 0 || + (!pvq->skip_rest && !(pvq->skip_dir & (1 << ((i - 1) % 3))))) { + pvq_encode_partition( + w, pvq->qg[i], pvq->theta[i], pvq->y + pvq->off[i], + pvq->size[i], pvq->k[i], model, adapt, exg + i, ext + i, + (plane != 0) * OD_TXSIZES * PVQ_MAX_PARTITIONS + + pvq->bs * PVQ_MAX_PARTITIONS + i, + is_keyframe, i == 0 && (i < pvq->nb_bands - 1), pvq->skip_rest, + encode_flip, flip); + } + if (i == 0 && !pvq->skip_rest && pvq->bs > 0) { + aom_write_symbol( + w, pvq->skip_dir, + &adapt->pvq + .pvq_skip_dir_cdf[(plane != 0) + 2 * (pvq->bs - 1)][0], + 7); + } + } + } + // Encode residue of DC coeff, if exist. + if (!has_dc_skip || (pvq->ac_dc_coded & DC_CODED)) { + generic_encode(w, &adapt->model_dc[plane], + abs(pvq->dq_dc_residue) - has_dc_skip, + &adapt->ex_dc[plane][pvq->bs][0], 2); + } + if ((pvq->ac_dc_coded & DC_CODED)) { + aom_write_bit(w, pvq->dq_dc_residue < 0); + } + } + } // for (idy = 0; +} +#endif // !CONFIG_PVG + +#if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE +static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, + const TOKENEXTRA *const tok_end, +#if CONFIG_PVQ + MACROBLOCK *const x, +#endif + MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, + BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, + int block, int blk_row, int blk_col, + TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + TX_SIZE plane_tx_size; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + plane_tx_size = + plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] + : mbmi->inter_tx_size[tx_row][tx_col]; + + if (tx_size == plane_tx_size) { + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); +#if !CONFIG_PVQ + pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, &tmp_token_stats); +#else + pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); +#endif +#if CONFIG_RD_DEBUG + token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int i; + + assert(bsl > 0); + + for (i = 0; i < 4; ++i) { + const int offsetr = blk_row + (i >> 1) * bsl; + const int offsetc = blk_col + (i & 0x01) * bsl; + const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + pack_txb_tokens(w, tp, tok_end, +#if CONFIG_PVQ + x, +#endif + xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, + offsetc, sub_txs, token_stats); + block += step; + } + } +} +#endif + +static void write_segment_id(aom_writer *w, const struct segmentation *seg, + struct segmentation_probs *segp, int segment_id) { + if (seg->enabled && seg->update_map) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS); +#else + aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0); +#endif + } +} + +// This function encodes the reference frame +static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int is_compound = has_second_ref(mbmi); + const int segment_id = mbmi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->reference_mode == REFERENCE_MODE_SELECT) { +#if SUB8X8_COMP_REF + aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); +#else + if (mbmi->sb_type >= BLOCK_8X8) + aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); +#endif + } else { + assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { +#if CONFIG_EXT_REFS + const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME); + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; +#else // CONFIG_EXT_REFS + const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME; +#endif // CONFIG_EXT_REFS + + aom_write(w, bit, av1_get_pred_prob_comp_ref_p(cm, xd)); + +#if CONFIG_EXT_REFS + if (!bit) { + const int bit1 = mbmi->ref_frame[0] == LAST_FRAME; + aom_write(w, bit1, av1_get_pred_prob_comp_ref_p1(cm, xd)); + } else { + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + aom_write(w, bit2, av1_get_pred_prob_comp_ref_p2(cm, xd)); + } + aom_write(w, bit_bwd, av1_get_pred_prob_comp_bwdref_p(cm, xd)); +#endif // CONFIG_EXT_REFS + } else { +#if CONFIG_EXT_REFS + const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME || + mbmi->ref_frame[0] == BWDREF_FRAME); + aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd)); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd)); + } else { + const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[0] == GOLDEN_FRAME); + aom_write(w, bit2, av1_get_pred_prob_single_ref_p3(cm, xd)); + + if (!bit2) { + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + aom_write(w, bit3, av1_get_pred_prob_single_ref_p4(cm, xd)); + } else { + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + aom_write(w, bit4, av1_get_pred_prob_single_ref_p5(cm, xd)); + } + } +#else // CONFIG_EXT_REFS + const int bit0 = mbmi->ref_frame[0] != LAST_FRAME; + aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd)); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME; + aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd)); + } +#endif // CONFIG_EXT_REFS + } + } +} + +#if CONFIG_FILTER_INTRA +static void write_filter_intra_mode_info(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { + if (mbmi->mode == DC_PRED +#if CONFIG_PALETTE + && mbmi->palette_mode_info.palette_size[0] == 0 +#endif // CONFIG_PALETTE + ) { + aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0], + cm->fc->filter_intra_probs[0]); + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode[0]; + write_uniform(w, FILTER_INTRA_MODES, mode); + } + } + + if (mbmi->uv_mode == DC_PRED +#if CONFIG_PALETTE + && mbmi->palette_mode_info.palette_size[1] == 0 +#endif // CONFIG_PALETTE + ) { + aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1], + cm->fc->filter_intra_probs[1]); + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode[1]; + write_uniform(w, FILTER_INTRA_MODES, mode); + } + } +} +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_EXT_INTRA +static void write_intra_angle_info(const MACROBLOCKD *xd, + FRAME_CONTEXT *const ec_ctx, aom_writer *w) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_INTRA_INTERP + const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); + int p_angle; +#endif // CONFIG_INTRA_INTERP + + (void)ec_ctx; + if (bsize < BLOCK_8X8) return; + + if (av1_is_directional_mode(mbmi->mode, bsize)) { + write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); +#if CONFIG_INTRA_INTERP + p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; + if (av1_is_intra_filter_switchable(p_angle)) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, mbmi->intra_filter, + ec_ctx->intra_filter_cdf[intra_filter_ctx], + INTRA_FILTERS); +#else + av1_write_token(w, av1_intra_filter_tree, + ec_ctx->intra_filter_probs[intra_filter_ctx], + &intra_filter_encodings[mbmi->intra_filter]); +#endif // CONFIG_EC_MULTISYMBOL + } +#endif // CONFIG_INTRA_INTERP + } + + if (av1_is_directional_mode(mbmi->uv_mode, bsize)) { + write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[1]); + } +} +#endif // CONFIG_EXT_INTRA + +static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif + + if (!av1_is_interp_needed(xd)) { +#if CONFIG_DUAL_FILTER + for (int i = 0; i < 4; ++i) + assert(mbmi->interp_filter[i] == (cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter)); +#else + assert(mbmi->interp_filter == (cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter)); +#endif // CONFIG_DUAL_FILTER + return; + } + if (cm->interp_filter == SWITCHABLE) { +#if CONFIG_DUAL_FILTER + int dir; + for (dir = 0; dir < 2; ++dir) { + if (has_subpel_mv_component(xd->mi[0], xd, dir) || + (mbmi->ref_frame[1] > INTRA_FRAME && + has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter[dir]], + ec_ctx->switchable_interp_cdf[ctx], + SWITCHABLE_FILTERS); +#else + av1_write_token(w, av1_switchable_interp_tree, + ec_ctx->switchable_interp_prob[ctx], + &switchable_interp_encodings[mbmi->interp_filter[dir]]); +#endif + ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]]; + } else { + assert(mbmi->interp_filter[dir] == EIGHTTAP_REGULAR); + } + } +#else + { + const int ctx = av1_get_pred_context_switchable_interp(xd); +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter], + ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); +#else + av1_write_token(w, av1_switchable_interp_tree, + ec_ctx->switchable_interp_prob[ctx], + &switchable_interp_encodings[mbmi->interp_filter]); +#endif + ++cpi->interp_filter_selected[0][mbmi->interp_filter]; + } +#endif // CONFIG_DUAL_FILTER + } +} + +#if CONFIG_PALETTE +#if CONFIG_PALETTE_DELTA_ENCODING +// Write luma palette color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. The luma +// palette is sorted so each delta is larger than 0. +static void write_palette_colors_y(const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + int min_bits, i; + int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits); + aom_write_literal(w, bits - min_bits, 2); + aom_write_literal(w, pmi->palette_colors[0], bit_depth); + for (i = 1; i < n; ++i) { + aom_write_literal( + w, pmi->palette_colors[i] - pmi->palette_colors[i - 1] - 1, bits); + bits = + AOMMIN(bits, av1_ceil_log2((1 << bit_depth) - pmi->palette_colors[i])); + } +} + +// Write chroma palette color values. Use delta encoding for u channel as its +// palette is sorted. For v channel, either use delta encoding or transmit +// raw values directly, whichever costs less. +static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + int i; + const int n = pmi->palette_size[1]; +#if CONFIG_HIGHBITDEPTH + const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; +#else + const uint8_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint8_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; +#endif // CONFIG_HIGHBITDEPTH + // U channel colors. + int min_bits_u = 0; + int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u); + aom_write_literal(w, bits_u - min_bits_u, 2); + aom_write_literal(w, colors_u[0], bit_depth); + for (i = 1; i < n; ++i) { + aom_write_literal(w, colors_u[i] - colors_u[i - 1], bits_u); + bits_u = AOMMIN(bits_u, av1_ceil_log2(1 + (1 << bit_depth) - colors_u[i])); + } + // V channel colors. + const int max_val = 1 << bit_depth; + int zero_count = 0, min_bits_v = 0; + int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int rate_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int rate_using_raw = bit_depth * n; + if (rate_using_delta < rate_using_raw) { // delta encoding + aom_write_bit(w, 1); + aom_write_literal(w, bits_v - min_bits_v, 2); + aom_write_literal(w, colors_v[0], bit_depth); + for (i = 1; i < n; ++i) { + if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. + aom_write_literal(w, 0, bits_v); + continue; + } + const int delta = abs((int)colors_v[i] - colors_v[i - 1]); + const int sign_bit = colors_v[i] < colors_v[i - 1]; + if (delta <= max_val - delta) { + aom_write_literal(w, delta, bits_v); + aom_write_bit(w, sign_bit); + } else { + aom_write_literal(w, max_val - delta, bits_v); + aom_write_bit(w, !sign_bit); + } + } + } else { // Transmit raw values. + aom_write_bit(w, 0); + for (i = 0; i < n; ++i) aom_write_literal(w, colors_v[i], bit_depth); + } +} +#endif // CONFIG_PALETTE_DELTA_ENCODING + +static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, + const MODE_INFO *const mi, aom_writer *w) { + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const BLOCK_SIZE bsize = mbmi->sb_type; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + int palette_y_mode_ctx = 0; + if (above_mi) + palette_y_mode_ctx += + (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); + if (left_mi) + palette_y_mode_ctx += + (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); + aom_write( + w, n > 0, + av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_y_mode_ctx]); + if (n > 0) { + av1_write_token(w, av1_palette_size_tree, + av1_default_palette_y_size_prob[bsize - BLOCK_8X8], + &palette_size_encodings[n - PALETTE_MIN_SIZE]); +#if CONFIG_PALETTE_DELTA_ENCODING + write_palette_colors_y(pmi, cm->bit_depth, w); +#else + int i; + for (i = 0; i < n; ++i) + aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth); +#endif // CONFIG_PALETTE_DELTA_ENCODING + write_uniform(w, n, pmi->palette_first_color_idx[0]); + } + } + + if (mbmi->uv_mode == DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]); + if (n > 0) { + av1_write_token(w, av1_palette_size_tree, + av1_default_palette_uv_size_prob[bsize - BLOCK_8X8], + &palette_size_encodings[n - PALETTE_MIN_SIZE]); +#if CONFIG_PALETTE_DELTA_ENCODING + write_palette_colors_uv(pmi, cm->bit_depth, w); +#else + int i; + for (i = 0; i < n; ++i) { + aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i], + cm->bit_depth); + aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i], + cm->bit_depth); + } +#endif // CONFIG_PALETTE_DELTA_ENCODING + write_uniform(w, n, pmi->palette_first_color_idx[1]); + } + } +} +#endif // CONFIG_PALETTE + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, +#if CONFIG_SUPERTX + const int supertx_enabled, +#endif +#if CONFIG_TXK_SEL + int block, int plane, +#endif + aom_writer *w) { + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int is_inter = is_inter_block(mbmi); +#if CONFIG_VAR_TX + const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size; +#else + const TX_SIZE tx_size = mbmi->tx_size; +#endif // CONFIG_VAR_TX +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif + +#if !CONFIG_TXK_SEL + TX_TYPE tx_type = mbmi->tx_type; +#else + // Only y plane's tx_type is transmitted + if (plane > 0) return; + PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); +#endif + + if (!FIXED_TX_TYPE) { +#if CONFIG_EXT_TX + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + const BLOCK_SIZE bsize = mbmi->sb_type; + if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > + 1 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && +#if CONFIG_SUPERTX + !supertx_enabled && +#endif // CONFIG_SUPERTX + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = + get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + assert(ext_tx_used_inter[eset][tx_type]); + if (eset > 0) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + ext_tx_cnt_inter[eset]); +#else + av1_write_token(w, av1_ext_tx_inter_tree[eset], + ec_ctx->inter_ext_tx_prob[eset][square_tx_size], + &ext_tx_inter_encodings[eset][tx_type]); +#endif + } + } else if (ALLOW_INTRA_EXT_TX) { + assert(ext_tx_used_intra[eset][tx_type]); + if (eset > 0) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol( + w, av1_ext_tx_intra_ind[eset][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], + ext_tx_cnt_intra[eset]); +#else + av1_write_token( + w, av1_ext_tx_intra_tree[eset], + ec_ctx->intra_ext_tx_prob[eset][square_tx_size][mbmi->mode], + &ext_tx_intra_encodings[eset][tx_type]); +#endif + } + } + } +#else + if (tx_size < TX_32X32 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && +#if CONFIG_SUPERTX + !supertx_enabled && +#endif // CONFIG_SUPERTX + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (is_inter) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_ext_tx_ind[tx_type], + ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES); +#else + av1_write_token(w, av1_ext_tx_tree, ec_ctx->inter_ext_tx_prob[tx_size], + &ext_tx_encodings[tx_type]); +#endif + } else { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol( + w, av1_ext_tx_ind[tx_type], + ec_ctx->intra_ext_tx_cdf[tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]], + TX_TYPES); +#else + av1_write_token( + w, av1_ext_tx_tree, + ec_ctx + ->intra_ext_tx_prob[tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]], + &ext_tx_encodings[tx_type]); +#endif + } + } +#endif // CONFIG_EXT_TX + } +} + +static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, + PREDICTION_MODE mode, aom_writer *w) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_intra_mode_ind[mode], + frame_ctx->y_mode_cdf[size_group_lookup[bsize]], + INTRA_MODES); +#else + av1_write_token(w, av1_intra_mode_tree, + frame_ctx->y_mode_prob[size_group_lookup[bsize]], + &intra_mode_encodings[mode]); +#endif +} + +static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode, + aom_writer *w) { +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, av1_intra_mode_ind[uv_mode], + frame_ctx->uv_mode_cdf[y_mode], INTRA_MODES); +#else + av1_write_token(w, av1_intra_mode_tree, frame_ctx->uv_mode_prob[y_mode], + &intra_mode_encodings[uv_mode]); +#endif +} + +static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, + const int mi_col, +#if CONFIG_SUPERTX + int supertx_enabled, +#endif + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; +#if CONFIG_DELTA_Q || CONFIG_EC_ADAPT + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; +#else + const MACROBLOCK *x = &cpi->td.mb; + const MACROBLOCKD *xd = &x->e_mbd; +#endif +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif +#if !CONFIG_REF_MV + nmv_context *nmvc = &ec_ctx->nmvc; +#endif + const MODE_INFO *mi = xd->mi[0]; + + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &cm->fc->seg; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int skip, ref; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + (void)mi_row; + (void)mi_col; + + if (seg->update_map) { + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd); + aom_write(w, pred_flag, pred_prob); + if (!pred_flag) write_segment_id(w, seg, segp, segment_id); + } else { + write_segment_id(w, seg, segp, segment_id); + } + } + +#if CONFIG_SUPERTX + if (supertx_enabled) + skip = mbmi->skip; + else + skip = write_skip(cm, xd, segment_id, mi, w); +#else + skip = write_skip(cm, xd, segment_id, mi, w); +#endif // CONFIG_SUPERTX +#if CONFIG_DELTA_Q + if (cm->delta_q_present_flag) { + int super_block_upper_left = + ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); + if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { + assert(mbmi->current_q_index > 0); + int reduced_delta_qindex = + (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; + write_delta_qindex(cm, xd, reduced_delta_qindex, w); + xd->prev_qindex = mbmi->current_q_index; +#if CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) { + int reduced_delta_lflevel = + (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + cm->delta_lf_res; + write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); + xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + } +#endif // CONFIG_EXT_DELTA_Q + } + } +#endif + +#if CONFIG_SUPERTX + if (!supertx_enabled) +#endif // CONFIG_SUPERTX + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd)); + + if (cm->tx_mode == TX_MODE_SELECT && +#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX) +#if CONFIG_RECT_TX + bsize > BLOCK_4X4 && +#else + (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) && +#endif // CONFIG_RECT_TX +#else + bsize >= BLOCK_8X8 && +#endif +#if CONFIG_SUPERTX + !supertx_enabled && +#endif // CONFIG_SUPERTX + !(is_inter && skip) && !xd->lossless[segment_id]) { +#if CONFIG_VAR_TX + if (is_inter) { // This implies skip flag is 0. + const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int height = block_size_high[bsize] >> tx_size_wide_log2[0]; + int idx, idy; + for (idy = 0; idy < height; idy += bh) + for (idx = 0; idx < width; idx += bw) + write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy, + idx, w); + } else { + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); + write_selected_tx_size(cm, xd, w); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); +#else + write_selected_tx_size(cm, xd, w); +#endif + } + + if (!is_inter) { + if (bsize >= BLOCK_8X8 || unify_bsize) { + write_intra_mode(ec_ctx, bsize, mode, w); + } else { + int idx, idy; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; + write_intra_mode(ec_ctx, bsize, b_mode, w); + } + } + } +#if CONFIG_CB4X4 + if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) + write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); +#else // !CONFIG_CB4X4 + write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); +#endif // CONFIG_CB4X4 + +#if CONFIG_EXT_INTRA + write_intra_angle_info(xd, ec_ctx, w); +#endif // CONFIG_EXT_INTRA +#if CONFIG_PALETTE + if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) + write_palette_mode_info(cm, xd, mi, w); +#endif // CONFIG_PALETTE +#if CONFIG_FILTER_INTRA + if (bsize >= BLOCK_8X8 || unify_bsize) + write_filter_intra_mode_info(cm, mbmi, w); +#endif // CONFIG_FILTER_INTRA + } else { + int16_t mode_ctx; + write_ref_frames(cm, xd, w); + +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (is_compound) + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + else +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, bsize, -1); +#else // CONFIG_REF_MV + mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; +#endif // CONFIG_REF_MV + + // If segment skip is not enabled code the mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (bsize >= BLOCK_8X8 || unify_bsize) { +#if CONFIG_EXT_INTER + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(cm, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) +#endif // CONFIG_EXT_INTER + write_inter_mode(w, mode, ec_ctx, mode_ctx); + +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (mode == NEWMV || mode == NEW_NEWMV || + have_nearmv_in_inter_mode(mode)) +#else + if (mode == NEARMV || mode == NEWMV) +#endif + write_drl_idx(cm, mbmi, mbmi_ext, w); + else + assert(mbmi->ref_mv_idx == 0); +#endif + } + } + +#if !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION && !CONFIG_GLOBAL_MOTION + write_mb_interp_filter(cpi, xd, w); +#endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION + + if (bsize < BLOCK_8X8 && !unify_bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (!is_compound) +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, bsize, j); +#endif +#if CONFIG_EXT_INTER + if (is_inter_compound_mode(b_mode)) + write_inter_compound_mode(cm, w, b_mode, mode_ctx); + else if (is_inter_singleref_mode(b_mode)) +#endif // CONFIG_EXT_INTER + write_inter_mode(w, b_mode, ec_ctx, mode_ctx); + +#if CONFIG_EXT_INTER + if (b_mode == NEWMV || b_mode == NEW_NEWMV) { +#else + if (b_mode == NEWMV) { +#endif // CONFIG_EXT_INTER + for (ref = 0; ref < 1 + is_compound; ++ref) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], ref, + mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, +#if CONFIG_EXT_INTER + &mi->bmi[j].ref_mv[ref].as_mv, +#else +#if CONFIG_REF_MV + &mi->bmi[j].pred_mv[ref].as_mv, +#else + &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, +#endif // CONFIG_REF_MV +#endif // CONFIG_EXT_INTER + nmvc, allow_hp); + } + } +#if CONFIG_EXT_INTER + else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 1, + mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv, + &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp); + } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, + mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv, + &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp); + } +#endif // CONFIG_EXT_INTER + } + } + } else { +#if CONFIG_EXT_INTER + if (mode == NEWMV || mode == NEW_NEWMV) { +#else + if (mode == NEWMV) { +#endif // CONFIG_EXT_INTER + int_mv ref_mv; + for (ref = 0; ref < 1 + is_compound; ++ref) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], ref, + mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0]; + av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } +#if CONFIG_EXT_INTER + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, + &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc, + allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; +#endif + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, + &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc, + allow_hp); +#endif // CONFIG_EXT_INTER + } + } + +#if CONFIG_EXT_INTER + if (cpi->common.reference_mode != COMPOUND_REFERENCE && +#if CONFIG_SUPERTX + !supertx_enabled && +#endif // CONFIG_SUPERTX + is_interintra_allowed(mbmi)) { + const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; + const int bsize_group = size_group_lookup[bsize]; + aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]); + if (interintra) { + write_interintra_mode(w, mbmi->interintra_mode, + cm->fc->interintra_mode_prob[bsize_group]); + if (is_interintra_wedge_used(bsize)) { + aom_write(w, mbmi->use_wedge_interintra, + cm->fc->wedge_interintra_prob[bsize]); + if (mbmi->use_wedge_interintra) { + aom_write_literal(w, mbmi->interintra_wedge_index, + get_wedge_bits_lookup(bsize)); + assert(mbmi->interintra_wedge_sign == 0); + } + } + } + } +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_SUPERTX + if (!supertx_enabled) +#endif // CONFIG_SUPERTX +#if CONFIG_EXT_INTER + if (mbmi->ref_frame[1] != INTRA_FRAME) +#endif // CONFIG_EXT_INTER + write_motion_mode(cm, mi, w); +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + +#if CONFIG_EXT_INTER + if (cpi->common.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) +#if CONFIG_MOTION_VAR + && mbmi->motion_mode == SIMPLE_TRANSLATION +#endif // CONFIG_MOTION_VAR + && is_any_masked_compound_used(bsize)) { +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + av1_write_token(w, av1_compound_type_tree, + cm->fc->compound_type_prob[bsize], + &compound_type_encodings[mbmi->interinter_compound_type]); +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#if CONFIG_WEDGE + if (mbmi->interinter_compound_type == COMPOUND_WEDGE) { + aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize)); + aom_write_bit(w, mbmi->wedge_sign); + } +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + if (mbmi->interinter_compound_type == COMPOUND_SEG) { + aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS); + } +#endif // CONFIG_COMPOUND_SEGMENT + } +#endif // CONFIG_EXT_INTER + +#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION + write_mb_interp_filter(cpi, xd, w); +#endif // CONFIG_DUAL_FILTE || CONFIG_WARPED_MOTION + } + +#if !CONFIG_TXK_SEL + av1_write_tx_type(cm, xd, +#if CONFIG_SUPERTX + supertx_enabled, +#endif + w); +#endif // !CONFIG_TXK_SEL +} + +#if CONFIG_DELTA_Q +static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, const int mi_row, + const int mi_col, aom_writer *w) { + int skip; +#else +static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, + const int mi_row, const int mi_col, + aom_writer *w) { +#endif + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &cm->fc->seg; + const MODE_INFO *const mi = xd->mi[0]; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + (void)mi_row; + (void)mi_col; + +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif + + if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id); + +#if CONFIG_DELTA_Q + skip = write_skip(cm, xd, mbmi->segment_id, mi, w); + if (cm->delta_q_present_flag) { + int super_block_upper_left = + ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); + if ((bsize != BLOCK_LARGEST || skip == 0) && super_block_upper_left) { + assert(mbmi->current_q_index > 0); + int reduced_delta_qindex = + (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; + write_delta_qindex(cm, xd, reduced_delta_qindex, w); + xd->prev_qindex = mbmi->current_q_index; +#if CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) { + int reduced_delta_lflevel = + (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + cm->delta_lf_res; + write_delta_lflevel(cm, xd, reduced_delta_lflevel, w); + xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + } +#endif // CONFIG_EXT_DELTA_Q + } + } +#else + write_skip(cm, xd, mbmi->segment_id, mi, w); +#endif + + if (cm->tx_mode == TX_MODE_SELECT && +#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX) +#if CONFIG_RECT_TX + bsize > BLOCK_4X4 && +#else + bsize >= BLOCK_8X8 && +#endif // CONFIG_RECT_TX +#else + bsize >= BLOCK_8X8 && +#endif + !xd->lossless[mbmi->segment_id]) + write_selected_tx_size(cm, xd, w); + +#if CONFIG_INTRABC + if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) { + int use_intrabc = is_intrabc_block(mbmi); + aom_write(w, use_intrabc, INTRABC_PROB); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == DC_PRED); + int_mv dv_ref; + av1_find_ref_dv(&dv_ref, mi_row, mi_col); + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + return; + } + } +#endif // CONFIG_INTRABC + + if (bsize >= BLOCK_8X8 || unify_bsize) { + write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w); + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int block = idy * 2 + idx; + write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, block, + mi->bmi[block].as_mode, w); + } + } + } + +#if CONFIG_CB4X4 + if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) + write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); +#else // !CONFIG_CB4X4 + write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); +#endif // CONFIG_CB4X4 + +#if CONFIG_EXT_INTRA + write_intra_angle_info(xd, ec_ctx, w); +#endif // CONFIG_EXT_INTRA +#if CONFIG_PALETTE + if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) + write_palette_mode_info(cm, xd, mi, w); +#endif // CONFIG_PALETTE +#if CONFIG_FILTER_INTRA + if (bsize >= BLOCK_8X8 || unify_bsize) + write_filter_intra_mode_info(cm, mbmi, w); +#endif // CONFIG_FILTER_INTRA + +#if !CONFIG_TXK_SEL + av1_write_tx_type(cm, xd, +#if CONFIG_SUPERTX + 0, +#endif + w); +#endif // !CONFIG_TXK_SEL +} + +#if CONFIG_SUPERTX +#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ + mi_row, mi_col) \ + write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col) +#else +#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ + mi_row, mi_col) \ + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col) +#endif // CONFIG_SUPERTX + +#if CONFIG_RD_DEBUG +static void dump_mode_info(MODE_INFO *mi) { + printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row); + printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col); + printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type); + printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size); + if (mi->mbmi.sb_type >= BLOCK_8X8) { + printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode); + } else { + printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode); + } +} +static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, + int plane) { + if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { +#if CONFIG_VAR_TX + int r, c; +#endif + printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", + plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); +#if CONFIG_VAR_TX + printf("rd txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]); + } + printf("\n"); + } + + printf("pack txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", token_stats->txb_coeff_cost_map[r][c]); + } + printf("\n"); + } +#endif + return 1; + } + return 0; +} +#endif + +static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, +#if CONFIG_SUPERTX + int supertx_enabled, +#endif + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MODE_INFO *m; + int bh, bw; + xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + m = xd->mi[0]; + + assert(m->mbmi.sb_type <= cm->sb_size); + + bh = mi_size_high[m->mbmi.sb_type]; + bw = mi_size_wide[m->mbmi.sb_type]; + + cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); + + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cm, xd, mi_row, mi_col, w); + } else { +#if CONFIG_VAR_TX + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); +#endif +#if CONFIG_DUAL_FILTER + // has_subpel_mv_component needs the ref frame buffers set up to look + // up if they are scaled. has_subpel_mv_component is in turn needed by + // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. + set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); +#endif // CONFIG_DUAL_FILTER +#if 0 + // NOTE(zoeliu): For debug + if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) { + const PREDICTION_MODE mode = m->mbmi.mode; + const int segment_id = m->mbmi.segment_id; + const BLOCK_SIZE bsize = m->mbmi.sb_type; + + // For sub8x8, simply dump out the first sub8x8 block info + const PREDICTION_MODE b_mode = + (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1; + const int mv_x = (bsize < BLOCK_8X8) ? + m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row; + const int mv_y = (bsize < BLOCK_8X8) ? + m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col; + + printf("Before pack_inter_mode_mvs(): " + "Frame=%d, (mi_row,mi_col)=(%d,%d), " + "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, " + "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n", + cm->current_video_frame, mi_row, mi_col, + mode, segment_id, bsize, b_mode, mv_x, mv_y, + m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); + } +#endif // 0 + pack_inter_mode_mvs(cpi, mi_row, mi_col, +#if CONFIG_SUPERTX + supertx_enabled, +#endif + w); + } +} + +static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MODE_INFO *const m = xd->mi[0]; + MB_MODE_INFO *const mbmi = &m->mbmi; + int plane; + int bh, bw; +#if CONFIG_PVQ || CONFIG_LV_MAP + MACROBLOCK *const x = &cpi->td.mb; + (void)tok; + (void)tok_end; +#endif + xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + + assert(mbmi->sb_type <= cm->sb_size); + + bh = mi_size_high[mbmi->sb_type]; + bw = mi_size_wide[mbmi->sb_type]; + cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); + + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + +#if CONFIG_PALETTE + for (plane = 0; plane <= 1; ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + if (palette_size_plane > 0) { +#if CONFIG_INTRABC + assert(mbmi->use_intrabc == 0); +#endif + int rows, cols; + assert(mbmi->sb_type >= BLOCK_8X8); + av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1); + assert(*tok < tok_end + mbmi->skip); + } + } +#endif // CONFIG_PALETTE + +#if CONFIG_COEF_INTERLEAVE + if (!mbmi->skip) { + const struct macroblockd_plane *const pd_y = &xd->plane[0]; + const struct macroblockd_plane *const pd_c = &xd->plane[1]; + const TX_SIZE tx_log2_y = mbmi->tx_size; + const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c); + const int tx_sz_y = (1 << tx_log2_y); + const int tx_sz_c = (1 << tx_log2_c); + + const BLOCK_SIZE plane_bsize_y = + get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_y); + const BLOCK_SIZE plane_bsize_c = + get_plane_block_size(AOMMAX(mbmi->sb_type, 3), pd_c); + + const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y]; + const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c]; + const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y]; + const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c]; + + const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, + pd_y->subsampling_x); + const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, + pd_y->subsampling_y); + const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, + pd_c->subsampling_x); + const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, + pd_c->subsampling_y); + + // The max_4x4_w/h may be smaller than tx_sz under some corner cases, + // i.e. when the SB is splitted by tile boundaries. + const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y; + const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y; + const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c; + const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c; + const int tu_num_y = tu_num_w_y * tu_num_h_y; + const int tu_num_c = tu_num_w_c * tu_num_h_c; + + int tu_idx_y = 0, tu_idx_c = 0; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + assert(*tok < tok_end); + + while (tu_idx_y < tu_num_y) { + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + tu_idx_y++; + + if (tu_idx_c < tu_num_c) { + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + + tu_idx_c++; + } + } + + // In 422 case, it's possilbe that Chroma has more TUs than Luma + while (tu_idx_c < tu_num_c) { + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + + tu_idx_c++; + } + } +#else // CONFIG_COEF_INTERLEAVE + if (!mbmi->skip) { +#if !CONFIG_PVQ && !CONFIG_LV_MAP + assert(*tok < tok_end); +#endif + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) { + (*tok)++; + continue; + } +#endif +#if CONFIG_VAR_TX + const struct macroblockd_plane *const pd = &xd->plane[plane]; + BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#else + const BLOCK_SIZE plane_bsize = + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#endif +#else + const BLOCK_SIZE plane_bsize = + get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); +#endif + + const int num_4x4_w = + block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int num_4x4_h = + block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + int row, col; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + if (is_inter_block(mbmi)) { + const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize); + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + for (row = 0; row < num_4x4_h; row += bkh) { + for (col = 0; col < num_4x4_w; col += bkw) { + pack_txb_tokens(w, tok, tok_end, +#if CONFIG_PVQ + x, +#endif + xd, mbmi, plane, plane_bsize, cm->bit_depth, block, + row, col, max_tx_size, &token_stats); + block += step; + } + } +#if CONFIG_RD_DEBUG + if (mbmi->sb_type >= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(m); + assert(0); + } +#endif // CONFIG_RD_DEBUG + } else { + TX_SIZE tx = get_tx_size(plane, xd); +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + tx = AOMMAX(TX_4X4, tx); +#endif + const int bkw = tx_size_wide_unit[tx]; + const int bkh = tx_size_high_unit[tx]; + for (row = 0; row < num_4x4_h; row += bkh) { + for (col = 0; col < num_4x4_w; col += bkw) { +#if !CONFIG_PVQ + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats); +#else + pack_pvq_tokens(w, x, xd, plane, bsize, tx); +#endif + } + } + } +#else + TX_SIZE tx = get_tx_size(plane, xd); + TOKEN_STATS token_stats; +#if !CONFIG_PVQ + init_token_stats(&token_stats); +#if CONFIG_LV_MAP + (void)tx; + av1_write_coeffs_mb(cm, x, w, plane); +#else // CONFIG_LV_MAP + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats); +#endif // CONFIG_LV_MAP + +#else + (void)token_stats; + pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx); +#endif +#if CONFIG_RD_DEBUG + if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(m); + assert(0); + } +#endif // CONFIG_RD_DEBUG +#endif // CONFIG_VAR_TX + +#if !CONFIG_PVQ && !CONFIG_LV_MAP + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; +#endif + } + } +#endif // CONFIG_COEF_INTERLEAVE +} + +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int hbs = mi_size_wide[bsize] / 2; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = get_partition(cm, mi_row, mi_col, bsize); + subsize = get_subsize(bsize, partition); + + if (subsize < BLOCK_8X8 && !unify_bsize) { + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + } else { + switch (partition) { + case PARTITION_NONE: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < cm->mi_rows) + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < cm->mi_cols) + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, + subsize); + write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, + subsize); + write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } + } +} +#endif + +static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, +#if CONFIG_SUPERTX + int supertx_enabled, +#endif + int mi_row, int mi_col) { + write_mbmi_b(cpi, tile, w, +#if CONFIG_SUPERTX + supertx_enabled, +#endif + mi_row, mi_col); +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC + (void)tok; + (void)tok_end; +#else +#if !CONFIG_PVQ && CONFIG_SUPERTX + if (!supertx_enabled) +#endif + write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); +#endif +} + +static void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, int mi_row, + int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, + aom_writer *w) { + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + const int is_partition_point = bsize >= BLOCK_8X8; + const int ctx = is_partition_point + ? partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + has_rows, has_cols, +#endif + bsize) + : 0; +#if CONFIG_UNPOISON_PARTITION_CTX + const aom_prob *const probs = + ctx < PARTITION_CONTEXTS ? cm->fc->partition_prob[ctx] : NULL; +#else + const aom_prob *const probs = cm->fc->partition_prob[ctx]; +#endif + +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; +#elif CONFIG_EC_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif + + if (!is_partition_point) return; + + if (has_rows && has_cols) { +#if CONFIG_EXT_PARTITION_TYPES + if (bsize <= BLOCK_8X8) +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES); +#else + av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]); +#endif + else +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], EXT_PARTITION_TYPES); +#else + av1_write_token(w, av1_ext_partition_tree, probs, + &ext_partition_encodings[p]); +#endif // CONFIG_EC_MULTISYMBOL +#else +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES); +#else + av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]); +#endif +#endif // CONFIG_EXT_PARTITION_TYPES + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + aom_write(w, p == PARTITION_SPLIT, probs[1]); + } else if (has_rows && !has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + aom_write(w, p == PARTITION_SPLIT, probs[2]); + } else { + assert(p == PARTITION_SPLIT); + } +} + +#if CONFIG_SUPERTX +#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ + mi_row, mi_col, bsize) \ + write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \ + bsize) +#else +#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \ + mi_row, mi_col, bsize) \ + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize) +#endif // CONFIG_SUPERTX + +static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, + aom_writer *const w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, +#if CONFIG_SUPERTX + int supertx_enabled, +#endif + int mi_row, int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const int hbs = mi_size_wide[bsize] / 2; + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + +#if CONFIG_SUPERTX + const int mi_offset = mi_row * cm->mi_stride + mi_col; + MB_MODE_INFO *mbmi; + const int pack_token = !supertx_enabled; + TX_SIZE supertx_size; + int plane; +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); +#if CONFIG_SUPERTX + mbmi = &cm->mi_grid_visible[mi_offset]->mbmi; + xd->mi = cm->mi_grid_visible + mi_offset; + set_mi_row_col(xd, tile, mi_row, mi_size_high[bsize], mi_col, + mi_size_wide[bsize], +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + if (!supertx_enabled && !frame_is_intra_only(cm) && + partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE && + !xd->lossless[0]) { + aom_prob prob; + supertx_size = max_txsize_lookup[bsize]; + prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] + [supertx_size]; + supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size); + aom_write(w, supertx_enabled, prob); + } +#endif // CONFIG_SUPERTX + if (subsize < BLOCK_8X8 && !unify_bsize) { + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, + mi_col); + } else { + switch (partition) { + case PARTITION_NONE: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + if (mi_row + hbs < cm->mi_rows) + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + if (mi_col + hbs < cm->mi_cols) + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col, subsize); + write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col + hbs, subsize); + write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col, subsize); + write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col + hbs, subsize); + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col + hbs); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, mi_col + hbs); + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row + hbs, mi_col + hbs); + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } + } +#if CONFIG_SUPERTX + if (partition != PARTITION_NONE && supertx_enabled && pack_token) { + int skip; + const int bsw = mi_size_wide[bsize]; + const int bsh = mi_size_high[bsize]; + + xd->mi = cm->mi_grid_visible + mi_offset; + supertx_size = mbmi->tx_size; + set_mi_row_col(xd, tile, mi_row, bsh, mi_col, bsw, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + + assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0)); + assert(mbmi->segment_id_supertx < MAX_SEGMENTS); + + skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w); +#if CONFIG_EXT_TX + if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && + !skip) { + const int eset = + get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); + if (eset > 0) { +#if CONFIG_EC_MULTISYMBOL +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *ec_ctx = cm->fc; +#endif + aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type], + ec_ctx->inter_ext_tx_cdf[eset][supertx_size], + ext_tx_cnt_inter[eset]); +#else + av1_write_token(w, av1_ext_tx_inter_tree[eset], + cm->fc->inter_ext_tx_prob[eset][supertx_size], + &ext_tx_inter_encodings[eset][mbmi->tx_type]); +#endif + } + } +#else + if (supertx_size < TX_32X32 && !skip) { + av1_write_token(w, av1_ext_tx_tree, + cm->fc->inter_ext_tx_prob[supertx_size], + &ext_tx_encodings[mbmi->tx_type]); + } +#endif // CONFIG_EXT_TX + + if (!skip) { + assert(*tok < tok_end); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd); + + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + + int row, col; + TX_SIZE tx = get_tx_size(plane, xd); + BLOCK_SIZE txb_size = txsize_to_bsize[tx]; + + const int stepr = tx_size_high_unit[txb_size]; + const int stepc = tx_size_wide_unit[txb_size]; + + TOKEN_STATS token_stats; + token_stats.cost = 0; + for (row = 0; row < max_blocks_high; row += stepr) + for (col = 0; col < max_blocks_wide; col += stepc) + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats); + assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN); + (*tok)++; + } + } +#if CONFIG_VAR_TX + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bsw, bsh, skip, xd); +#endif + } +#endif // CONFIG_SUPERTX + +// update partition context +#if CONFIG_EXT_PARTITION_TYPES + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +#else + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +#endif // CONFIG_EXT_PARTITION_TYPES + +#if CONFIG_CDEF +#if CONFIG_EXT_PARTITION + if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 && + !sb_all_skip(cm, mi_row, mi_col)) { + aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.cdef_strength, + cm->cdef_bits); + } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 && +#else + if (bsize == BLOCK_64X64 && +#endif // CONFIG_EXT_PARTITION + !sb_all_skip(cm, mi_row, mi_col)) { + if (cm->cdef_bits != 0) + aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.cdef_strength, + cm->cdef_bits); + } +#endif +} + +static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, + aom_writer *const w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const int mi_row_start = tile->mi_row_start; + const int mi_row_end = tile->mi_row_end; + const int mi_col_start = tile->mi_col_start; + const int mi_col_end = tile->mi_col_end; + int mi_row, mi_col; + +#if CONFIG_DEPENDENT_HORZTILES +#if CONFIG_TILE_GROUPS + if (!cm->dependent_horz_tiles || mi_row_start == 0 || + tile->tg_horz_boundary) { +#else + if (!cm->dependent_horz_tiles || mi_row_start == 0) { +#endif + av1_zero_above_context(cm, mi_col_start, mi_col_end); + } +#else + av1_zero_above_context(cm, mi_col_start, mi_col_end); +#endif +#if CONFIG_PVQ + assert(cpi->td.mb.pvq_q->curr_pos == 0); +#endif +#if CONFIG_DELTA_Q + if (cpi->common.delta_q_present_flag) { + xd->prev_qindex = cpi->common.base_qindex; +#if CONFIG_EXT_DELTA_Q + if (cpi->common.delta_lf_present_flag) { + xd->prev_delta_lf_from_base = 0; + } +#endif // CONFIG_EXT_DELTA_Q + } +#endif + + for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) { + av1_zero_left_context(xd); + + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) { + write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col, + cm->sb_size); +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC + write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size); +#endif + } + } +#if CONFIG_PVQ + // Check that the number of PVQ blocks encoded and written to the bitstream + // are the same + assert(cpi->td.mb.pvq_q->curr_pos == cpi->td.mb.pvq_q->last_pos); + // Reset curr_pos in case we repack the bitstream + cpi->td.mb.pvq_q->curr_pos = 0; +#endif +} + +#if !CONFIG_LV_MAP +#if !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size, + av1_coeff_stats *coef_branch_ct, + av1_coeff_probs_model *coef_probs) { + av1_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size]; + unsigned int(*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = + cpi->common.counts.eob_branch[tx_size]; + int i, j, k, l, m; +#if CONFIG_RECT_TX + assert(!is_rect_tx(tx_size)); +#endif // CONFIG_RECT_TX + + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + av1_tree_probs_from_distribution(av1_coef_tree, + coef_branch_ct[i][j][k][l], + coef_counts[i][j][k][l]); + coef_branch_ct[i][j][k][l][0][1] = + eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + coef_probs[i][j][k][l][m] = + get_binary_prob(coef_branch_ct[i][j][k][l][m][0], + coef_branch_ct[i][j][k][l][m][1]); + } + } + } + } +} + +#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi, + TX_SIZE tx_size, + av1_coeff_stats *frame_branch_ct, + av1_coeff_probs_model *new_coef_probs) { + av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; + const aom_prob upd = DIFF_UPDATE_PROB; +#if CONFIG_EC_ADAPT + const int entropy_nodes_update = UNCONSTRAINED_NODES - 1; +#else + const int entropy_nodes_update = UNCONSTRAINED_NODES; +#endif + int i, j, k, l, t; + int stepsize = cpi->sf.coeff_prob_appx_step; +#if CONFIG_TILE_GROUPS + const int probwt = cpi->common.num_tg; +#else + const int probwt = 1; +#endif +#if CONFIG_RECT_TX + assert(!is_rect_tx(tx_size)); +#endif // CONFIG_RECT_TX + + switch (cpi->sf.use_fast_coef_updates) { + case TWO_LOOP: { + /* dry run to see if there is any update at all needed */ + int savings = 0; + int update[2] = { 0, 0 }; + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + const aom_prob oldp = old_coef_probs[i][j][k][l][t]; + int s; + int u = 0; + if (t == PIVOT_NODE) + s = av1_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], oldp, &newp, upd, + stepsize, probwt); + else + s = av1_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt); + + if (s > 0 && newp != oldp) u = 1; + if (u) + savings += s - (int)(av1_cost_zero(upd)); + else + savings -= (int)(av1_cost_zero(upd)); + update[u]++; + } + } + } + } + } + + /* Is coef updated at all */ + if (update[1] == 0 || savings < 0) { + aom_write_bit(bc, 0); + return; + } + aom_write_bit(bc, 1); + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + aom_prob *oldp = old_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + if (t == PIVOT_NODE) + s = av1_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, + stepsize, probwt); + else + s = av1_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd, + probwt); + if (s > 0 && newp != *oldp) u = 1; + aom_write(bc, u, upd); + if (u) { + /* send/use new probability */ + av1_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + return; + } + + case ONE_LOOP_REDUCED: { + int updates = 0; + int noupdates_before_first = 0; + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + aom_prob *oldp = old_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + if (t == PIVOT_NODE) { + s = av1_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, + stepsize, probwt); + } else { + s = av1_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd, + probwt); + } + + if (s > 0 && newp != *oldp) u = 1; + updates += u; + if (u == 0 && updates == 0) { + noupdates_before_first++; + continue; + } + if (u == 1 && updates == 1) { + int v; + // first update + aom_write_bit(bc, 1); + for (v = 0; v < noupdates_before_first; ++v) + aom_write(bc, 0, upd); + } + aom_write(bc, u, upd); + if (u) { + /* send/use new probability */ + av1_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + if (updates == 0) { + aom_write_bit(bc, 0); // no updates + } + return; + } + default: assert(0); + } +} +#endif +#if CONFIG_SUBFRAME_PROB_UPDATE +// Calculate the token counts between subsequent subframe updates. +static void get_coef_counts_diff( + AV1_COMP *cpi, int index, + av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES], + unsigned int eob_counts[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS]) { + int i, j, k, l, m, tx_size, val; + const int max_idx = cpi->common.coef_probs_update_idx; + const TX_MODE tx_mode = cpi->common.tx_mode; + const int max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; + + assert(max_idx < COEF_PROBS_BUFS); + + for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + if (index == max_idx) { + val = + cpi->common.counts.eob_branch[tx_size][i][j][k][l] - + subframe_stats->eob_counts_buf[max_idx][tx_size][i][j][k][l]; + } else { + val = subframe_stats + ->eob_counts_buf[index + 1][tx_size][i][j][k][l] - + subframe_stats->eob_counts_buf[index][tx_size][i][j][k][l]; + } + assert(val >= 0); + eob_counts[tx_size][i][j][k][l] = val; + + for (m = 0; m < ENTROPY_TOKENS; ++m) { + if (index == max_idx) { + val = cpi->td.rd_counts.coef_counts[tx_size][i][j][k][l][m] - + subframe_stats + ->coef_counts_buf[max_idx][tx_size][i][j][k][l][m]; + } else { + val = subframe_stats + ->coef_counts_buf[index + 1][tx_size][i][j][k][l][m] - + subframe_stats + ->coef_counts_buf[index][tx_size][i][j][k][l][m]; + } + assert(val >= 0); + coef_counts[tx_size][i][j][k][l][m] = val; + } + } +} + +static void update_coef_probs_subframe( + aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size, + av1_coeff_stats branch_ct[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES], + av1_coeff_probs_model *new_coef_probs) { + av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; + const aom_prob upd = DIFF_UPDATE_PROB; + const int entropy_nodes_update = UNCONSTRAINED_NODES; + int i, j, k, l, t; + int stepsize = cpi->sf.coeff_prob_appx_step; + const int max_idx = cpi->common.coef_probs_update_idx; + int idx; + unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2]; + + switch (cpi->sf.use_fast_coef_updates) { + case TWO_LOOP: { + /* dry run to see if there is any update at all needed */ + int savings = 0; + int update[2] = { 0, 0 }; + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + for (t = 0; t < ENTROPY_NODES; ++t) { + for (idx = 0; idx <= max_idx; ++idx) { + memcpy(this_branch_ct[t][idx], + branch_ct[idx][tx_size][i][j][k][l][t], + 2 * sizeof(this_branch_ct[t][idx][0])); + } + } + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + const aom_prob oldp = old_coef_probs[i][j][k][l][t]; + int s, u = 0; + + if (t == PIVOT_NODE) + s = av1_prob_update_search_model_subframe( + this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, + stepsize, max_idx); + else + s = av1_prob_update_search_subframe(this_branch_ct[t], oldp, + &newp, upd, max_idx); + if (s > 0 && newp != oldp) u = 1; + if (u) + savings += s - (int)(av1_cost_zero(upd)); + else + savings -= (int)(av1_cost_zero(upd)); + update[u]++; + } + } + } + } + } + + /* Is coef updated at all */ + if (update[1] == 0 || savings < 0) { + aom_write_bit(bc, 0); + return; + } + aom_write_bit(bc, 1); + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + for (t = 0; t < ENTROPY_NODES; ++t) { + for (idx = 0; idx <= max_idx; ++idx) { + memcpy(this_branch_ct[t][idx], + branch_ct[idx][tx_size][i][j][k][l][t], + 2 * sizeof(this_branch_ct[t][idx][0])); + } + } + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + aom_prob *oldp = old_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + + if (t == PIVOT_NODE) + s = av1_prob_update_search_model_subframe( + this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, + stepsize, max_idx); + else + s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp, + &newp, upd, max_idx); + if (s > 0 && newp != *oldp) u = 1; + aom_write(bc, u, upd); + if (u) { + /* send/use new probability */ + av1_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + return; + } + + case ONE_LOOP_REDUCED: { + int updates = 0; + int noupdates_before_first = 0; + for (i = 0; i < PLANE_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + for (t = 0; t < ENTROPY_NODES; ++t) { + for (idx = 0; idx <= max_idx; ++idx) { + memcpy(this_branch_ct[t][idx], + branch_ct[idx][tx_size][i][j][k][l][t], + 2 * sizeof(this_branch_ct[t][idx][0])); + } + } + for (t = 0; t < entropy_nodes_update; ++t) { + aom_prob newp = new_coef_probs[i][j][k][l][t]; + aom_prob *oldp = old_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + + if (t == PIVOT_NODE) + s = av1_prob_update_search_model_subframe( + this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, + stepsize, max_idx); + else + s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp, + &newp, upd, max_idx); + if (s > 0 && newp != *oldp) u = 1; + updates += u; + if (u == 0 && updates == 0) { + noupdates_before_first++; + continue; + } + if (u == 1 && updates == 1) { + int v; + // first update + aom_write_bit(bc, 1); + for (v = 0; v < noupdates_before_first; ++v) + aom_write(bc, 0, upd); + } + aom_write(bc, u, upd); + if (u) { + /* send/use new probability */ + av1_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } + if (updates == 0) { + aom_write_bit(bc, 0); // no updates + } + return; + } + default: assert(0); + } +} +#endif // CONFIG_SUBFRAME_PROB_UPDATE + +#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) { + const TX_MODE tx_mode = cpi->common.tx_mode; + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; +#if CONFIG_SUBFRAME_PROB_UPDATE + AV1_COMMON *cm = &cpi->common; + SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; + int i; + av1_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES]; + + if (cm->do_subframe_update && + cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + av1_copy(cpi->common.fc->coef_probs, + subframe_stats->enc_starting_coef_probs); + for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) { + get_coef_counts_diff(cpi, i, cpi->wholeframe_stats.coef_counts_buf[i], + cpi->wholeframe_stats.eob_counts_buf[i]); + } + } +#endif // CONFIG_SUBFRAME_PROB_UPDATE + + for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) { + av1_coeff_stats frame_branch_ct[PLANE_TYPES]; + av1_coeff_probs_model frame_coef_probs[PLANE_TYPES]; + if (cpi->td.counts->tx_size_totals[tx_size] <= 20 || CONFIG_RD_DEBUG || + (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) { + aom_write_bit(w, 0); + } else { +#if CONFIG_SUBFRAME_PROB_UPDATE + if (cm->do_subframe_update && + cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + unsigned int this_eob_counts_copy[PLANE_TYPES][REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS]; + av1_coeff_count coef_counts_copy[PLANE_TYPES]; + av1_copy(this_eob_counts_copy, cpi->common.counts.eob_branch[tx_size]); + av1_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]); + build_tree_distribution(cpi, tx_size, frame_branch_ct, + frame_coef_probs); + for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) { + av1_copy(cpi->common.counts.eob_branch[tx_size], + cpi->wholeframe_stats.eob_counts_buf[i][tx_size]); + av1_copy(cpi->td.rd_counts.coef_counts[tx_size], + cpi->wholeframe_stats.coef_counts_buf[i][tx_size]); + build_tree_distribution(cpi, tx_size, cpi->branch_ct_buf[i][tx_size], + dummy_frame_coef_probs); + } + av1_copy(cpi->common.counts.eob_branch[tx_size], this_eob_counts_copy); + av1_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy); + + update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf, + frame_coef_probs); + } else { +#endif // CONFIG_SUBFRAME_PROB_UPDATE + build_tree_distribution(cpi, tx_size, frame_branch_ct, + frame_coef_probs); + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, + frame_coef_probs); +#if CONFIG_SUBFRAME_PROB_UPDATE + } +#endif // CONFIG_SUBFRAME_PROB_UPDATE + } + } + +#if CONFIG_SUBFRAME_PROB_UPDATE + av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); + av1_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs); + if (cm->do_subframe_update && + cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS]; + av1_copy(eob_counts_copy, cm->counts.eob_branch); + for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) { + for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) + av1_full_to_model_counts(cm->counts.coef[tx_size], + subframe_stats->coef_counts_buf[i][tx_size]); + av1_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]); + av1_partial_adapt_probs(cm, 0, 0); + av1_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs); + } + av1_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]); + av1_copy(cm->counts.eob_branch, eob_counts_copy); + } +#endif // CONFIG_SUBFRAME_PROB_UPDATE +} +#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#endif // !CONFIG_EC_ADAPT +#endif // !CONFIG_LV_MAP + +#if CONFIG_LOOP_RESTORATION +static void encode_restoration_mode(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + int p; + RestorationInfo *rsi = &cm->rst_info[0]; + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; + default: assert(0); + } + for (p = 1; p < MAX_MB_PLANE; ++p) { + rsi = &cm->rst_info[p]; + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: aom_wb_write_bit(wb, 0); break; + case RESTORE_WIENER: aom_wb_write_bit(wb, 1); break; + default: assert(0); + } + } + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + rsi = &cm->rst_info[0]; + aom_wb_write_bit(wb, rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX); + if (rsi->restoration_tilesize != RESTORATION_TILESIZE_MAX) { + aom_wb_write_bit( + wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1)); + } + } +} + +static void write_wiener_filter(WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, aom_writer *wb) { + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static void write_sgrproj_filter(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { + aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); + aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, + SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin(wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, + SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) { + int i, p; + const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, + cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); + WienerInfo ref_wiener_info; + SgrprojInfo ref_sgrproj_info; + set_default_wiener(&ref_wiener_info); + set_default_sgrproj(&ref_sgrproj_info); + const int ntiles_uv = av1_get_rest_ntiles( + ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x), + ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y), + cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL); + RestorationInfo *rsi = &cm->rst_info[0]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { + // RESTORE_SWITCHABLE + for (i = 0; i < ntiles; ++i) { + av1_write_token( + wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob, + &switchable_restore_encodings[rsi->restoration_type[i]]); + if (rsi->restoration_type[i] == RESTORE_WIENER) { + write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb); + } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) { + write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb); + } + } + } else if (rsi->frame_restoration_type == RESTORE_WIENER) { + for (i = 0; i < ntiles; ++i) { + aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE, + RESTORE_NONE_WIENER_PROB); + if (rsi->restoration_type[i] != RESTORE_NONE) { + write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb); + } + } + } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { + for (i = 0; i < ntiles; ++i) { + aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE, + RESTORE_NONE_SGRPROJ_PROB); + if (rsi->restoration_type[i] != RESTORE_NONE) { + write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb); + } + } + } + } + for (p = 1; p < MAX_MB_PLANE; ++p) { + set_default_wiener(&ref_wiener_info); + rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type == RESTORE_WIENER) { + for (i = 0; i < ntiles_uv; ++i) { + if (ntiles_uv > 1) + aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE, + RESTORE_NONE_WIENER_PROB); + if (rsi->restoration_type[i] != RESTORE_NONE) { + write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb); + } + } + } else if (rsi->frame_restoration_type != RESTORE_NONE) { + assert(0); + } + } +} +#endif // CONFIG_LOOP_RESTORATION + +static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + int i; + struct loopfilter *lf = &cm->lf; + + // Encode the loop filter level and type + aom_wb_write_literal(wb, lf->filter_level, 6); + aom_wb_write_literal(wb, lf->sharpness_level, 3); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled). + aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + if (lf->mode_ref_delta_enabled) { + aom_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { + for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != lf->last_ref_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) { + lf->last_ref_deltas[i] = delta; + aom_wb_write_inv_signed_literal(wb, delta, 6); + } + } + + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != lf->last_mode_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) { + lf->last_mode_deltas[i] = delta; + aom_wb_write_inv_signed_literal(wb, delta, 6); + } + } + } + } +} + +#if CONFIG_CDEF +static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + int i; + aom_wb_write_literal(wb, cm->cdef_dering_damping - 5, 1); + aom_wb_write_literal(wb, cm->cdef_clpf_damping - 3, 2); + aom_wb_write_literal(wb, cm->cdef_bits, 2); + for (i = 0; i < cm->nb_cdef_strengths; i++) { + aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS); + aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS); + } +} +#endif + +static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { + if (delta_q != 0) { + aom_wb_write_bit(wb, 1); + aom_wb_write_inv_signed_literal(wb, delta_q, 6); + } else { + aom_wb_write_bit(wb, 0); + } +} + +static void encode_quantization(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); + write_delta_q(wb, cm->y_dc_delta_q); + write_delta_q(wb, cm->uv_dc_delta_q); + write_delta_q(wb, cm->uv_ac_delta_q); +#if CONFIG_AOM_QM + aom_wb_write_bit(wb, cm->using_qmatrix); + if (cm->using_qmatrix) { + aom_wb_write_literal(wb, cm->min_qmlevel, QM_LEVEL_BITS); + aom_wb_write_literal(wb, cm->max_qmlevel, QM_LEVEL_BITS); + } +#endif +} + +static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, + struct aom_write_bit_buffer *wb) { + int i, j; + const struct segmentation *seg = &cm->seg; + + aom_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Segmentation map + if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { + aom_wb_write_bit(wb, seg->update_map); + } else { + assert(seg->update_map == 1); + } + if (seg->update_map) { + // Select the coding strategy (temporal or spatial) + av1_choose_segmap_coding_method(cm, xd); + + // Write out the chosen coding method. + if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { + aom_wb_write_bit(wb, seg->temporal_update); + } else { + assert(seg->temporal_update == 0); + } + } + + // Segmentation data + aom_wb_write_bit(wb, seg->update_data); + if (seg->update_data) { + aom_wb_write_bit(wb, seg->abs_delta); + + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + const int active = segfeature_active(seg, i, j); + aom_wb_write_bit(wb, active); + if (active) { + const int data = get_segdata(seg, i, j); + const int data_max = av1_seg_feature_data_max(j); + + if (av1_is_segfeature_signed(j)) { + encode_unsigned_max(wb, abs(data), data_max); + aom_wb_write_bit(wb, data < 0); + } else { + encode_unsigned_max(wb, data, data_max); + } + } + } + } + } +} + +#if !CONFIG_EC_ADAPT +static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *cm = &cpi->common; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + + if (!cm->seg.enabled || !cm->seg.update_map) return; + + if (cm->seg.temporal_update) { + int i; + + for (i = 0; i < PREDICTION_PROBS; i++) + av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i], + cm->counts.seg.pred[i], probwt); + + prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs, + cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w); + } else { + prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs, + cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w); + } +} +#endif + +static void write_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd, TX_MODE *mode, + struct aom_write_bit_buffer *wb) { + int i, all_lossless = 1; + + if (cm->seg.enabled) { + for (i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + all_lossless = 0; + break; + } + } + } else { + all_lossless = xd->lossless[0]; + } + if (all_lossless) { + *mode = ONLY_4X4; + return; + } +#if CONFIG_TX64X64 + aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); + if (*mode != TX_MODE_SELECT) { + aom_wb_write_literal(wb, AOMMIN(*mode, ALLOW_32X32), 2); + if (*mode >= ALLOW_32X32) aom_wb_write_bit(wb, *mode == ALLOW_64X64); + } +#else + aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); + if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2); +#endif // CONFIG_TX64X64 +} + +#if !CONFIG_EC_ADAPT +static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w, + FRAME_COUNTS *counts) { +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + if (cm->tx_mode == TX_MODE_SELECT) { + int i, j; + for (i = 0; i < MAX_TX_DEPTH; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j], + counts->tx_size[i][j], i + 2, probwt, w); + } +} +#endif + +static void write_frame_interp_filter(InterpFilter filter, + struct aom_write_bit_buffer *wb) { + aom_wb_write_bit(wb, filter == SWITCHABLE); + if (filter != SWITCHABLE) + aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); +} + +static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { + if (cm->interp_filter == SWITCHABLE) { + // Check to see if only one of the filters is actually used + int count[SWITCHABLE_FILTERS]; + int i, j, c = 0; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + count[i] = 0; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + count[i] += counts->switchable_interp[j][i]; + c += (count[i] > 0); + } + if (c == 1) { + // Only one filter is used. So set the filter at frame level + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + if (count[i]) { +#if CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) +#if CONFIG_WARPED_MOTION + if (i == EIGHTTAP_REGULAR || WARP_WM_NEIGHBORS_WITH_OBMC) +#else + if (i == EIGHTTAP_REGULAR || WARP_GM_NEIGHBORS_WITH_OBMC) +#endif // CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR && (CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION) + cm->interp_filter = i; + break; + } + } + } + } +} + +static void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { +#if CONFIG_EXT_TILE + const int tile_width = + ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> + cm->mib_size_log2; + const int tile_height = + ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> + cm->mib_size_log2; + + assert(tile_width > 0); + assert(tile_height > 0); + + aom_wb_write_literal(wb, cm->tile_encoding_mode, 1); + +// Write the tile sizes +#if CONFIG_EXT_PARTITION + if (cm->sb_size == BLOCK_128X128) { + assert(tile_width <= 32); + assert(tile_height <= 32); + aom_wb_write_literal(wb, tile_width - 1, 5); + aom_wb_write_literal(wb, tile_height - 1, 5); + } else +#endif // CONFIG_EXT_PARTITION + { + assert(tile_width <= 64); + assert(tile_height <= 64); + aom_wb_write_literal(wb, tile_width - 1, 6); + aom_wb_write_literal(wb, tile_height - 1, 6); + } +#else + int min_log2_tile_cols, max_log2_tile_cols, ones; + av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + // columns + ones = cm->log2_tile_cols - min_log2_tile_cols; + while (ones--) aom_wb_write_bit(wb, 1); + + if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0); + + // rows + aom_wb_write_bit(wb, cm->log2_tile_rows != 0); + if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); +#endif // CONFIG_EXT_TILE + +#if CONFIG_DEPENDENT_HORZTILES + if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles); +#endif + +#if CONFIG_LOOPFILTERING_ACROSS_TILES + aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled); +#endif // CONFIG_LOOPFILTERING_ACROSS_TILES +} + +static int get_refresh_mask(AV1_COMP *cpi) { + int refresh_mask = 0; + +#if CONFIG_EXT_REFS + // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be + // notified to get LAST3_FRAME refreshed and then the virtual indexes for all + // the 3 LAST reference frames will be updated accordingly, i.e.: + // (1) The original virtual index for LAST3_FRAME will become the new virtual + // index for LAST_FRAME; and + // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be + // shifted and become the new virtual indexes for LAST2_FRAME and + // LAST3_FRAME. + refresh_mask |= + (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]); + if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) { + // We have swapped the virtual indices + refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->arf_map[0]); + } else { + refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx); + } +#else + refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx); +#endif // CONFIG_EXT_REFS + + if (av1_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function av1_encoder.c:av1_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { +#if CONFIG_EXT_REFS + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int arf_idx = cpi->arf_map[gf_group->arf_update_idx[gf_group->index]]; +#else + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; + } +#endif // CONFIG_EXT_REFS + return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } +} + +#if CONFIG_EXT_TILE +static INLINE int find_identical_tile( + const int tile_row, const int tile_col, + TileBufferEnc (*const tile_buffers)[1024]) { + const MV32 candidate_offset[1] = { { 1, 0 } }; + const uint8_t *const cur_tile_data = + tile_buffers[tile_row][tile_col].data + 4; + const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size; + + int i; + + if (tile_row == 0) return 0; + + // (TODO: yunqingwang) For now, only above tile is checked and used. + // More candidates such as left tile can be added later. + for (i = 0; i < 1; i++) { + int row_offset = candidate_offset[0].row; + int col_offset = candidate_offset[0].col; + int row = tile_row - row_offset; + int col = tile_col - col_offset; + uint8_t tile_hdr; + const uint8_t *tile_data; + TileBufferEnc *candidate; + + if (row < 0 || col < 0) continue; + + tile_hdr = *(tile_buffers[row][col].data); + + // Read out tcm bit + if ((tile_hdr >> 7) == 1) { + // The candidate is a copy tile itself + row_offset += tile_hdr & 0x7f; + row = tile_row - row_offset; + } + + candidate = &tile_buffers[row][col]; + + if (row_offset >= 128 || candidate->size != cur_tile_size) continue; + + tile_data = candidate->data + 4; + + if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue; + + // Identical tile found + assert(row_offset > 0); + return row_offset; + } + + // No identical tile found + return 0; +} +#endif // CONFIG_EXT_TILE + +#if CONFIG_TILE_GROUPS +static uint32_t write_tiles(AV1_COMP *const cpi, + struct aom_write_bit_buffer *wb, + unsigned int *max_tile_size, + unsigned int *max_tile_col_size) { +#else +static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, + unsigned int *max_tile_size, + unsigned int *max_tile_col_size) { +#endif + const AV1_COMMON *const cm = &cpi->common; +#if CONFIG_ANS + struct BufAnsCoder *buf_ans = &cpi->buf_ans; +#else + aom_writer mode_bc; +#endif // CONFIG_ANS + int tile_row, tile_col; + TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok; + TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; + uint32_t total_size = 0; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + unsigned int tile_size = 0; +#if CONFIG_TILE_GROUPS + const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; + const int have_tiles = n_log2_tiles > 0; + uint32_t comp_hdr_size; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cm->num_tg; + const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + int tile_count = 0; + int tg_count = 1; + int tile_size_bytes = 4; + int tile_col_size_bytes; + uint32_t uncompressed_hdr_size = 0; + uint8_t *dst = NULL; + struct aom_write_bit_buffer comp_hdr_len_wb; + struct aom_write_bit_buffer tg_params_wb; + struct aom_write_bit_buffer tile_size_bytes_wb; + uint32_t saved_offset; + int mtu_size = cpi->oxcf.mtu; + int curr_tg_data_size = 0; + int hdr_size; +#endif +#if CONFIG_EXT_TILE + const int have_tiles = tile_cols * tile_rows > 1; +#endif // CONFIG_EXT_TILE + + *max_tile_size = 0; + *max_tile_col_size = 0; + +// All tile size fields are output on 4 bytes. A call to remux_tiles will +// later compact the data if smaller headers are adequate. + +#if CONFIG_EXT_TILE + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile_info; + const int is_last_col = (tile_col == tile_cols - 1); + const uint32_t col_offset = total_size; + + av1_tile_set_col(&tile_info, cm, tile_col); + + // The last column does not have a column header + if (!is_last_col) total_size += 4; + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + const int data_offset = have_tiles ? 4 : 0; +#if CONFIG_EC_ADAPT + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; +#endif + av1_tile_set_row(&tile_info, cm, tile_row); + + buf->data = dst + total_size; + + // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, + // even for the last one, unless no tiling is used at all. + total_size += data_offset; +#if CONFIG_EC_ADAPT + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; +#endif +#if CONFIG_PVQ + cpi->td.mb.pvq_q = &this_tile->pvq_q; + cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; +#endif // CONFIG_PVQ +#if !CONFIG_ANS + aom_start_encode(&mode_bc, buf->data + data_offset); + write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + assert(tok == tok_end); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; +#else + buf_ans_write_init(buf_ans, buf->data + data_offset); + write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); + assert(tok == tok_end); + aom_buf_ans_flush(buf_ans); + tile_size = buf_ans_write_end(buf_ans); +#endif // !CONFIG_ANS +#if CONFIG_PVQ + cpi->td.mb.pvq_q = NULL; +#endif + buf->size = tile_size; + + // Record the maximum tile size we see, so we can compact headers later. + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + + if (have_tiles) { + // tile header: size of this tile, or copy offset + uint32_t tile_header = tile_size; + + // If the tile_encoding_mode is 1 (i.e. TILE_VR), check if this tile is + // a copy tile. + // Very low chances to have copy tiles on the key frames, so don't + // search on key frames to reduce unnecessary search. + if (cm->frame_type != KEY_FRAME && cm->tile_encoding_mode) { + const int idendical_tile_offset = + find_identical_tile(tile_row, tile_col, tile_buffers); + + if (idendical_tile_offset > 0) { + tile_size = 0; + tile_header = idendical_tile_offset | 0x80; + tile_header <<= 24; + } + } + + mem_put_le32(buf->data, tile_header); + } + + total_size += tile_size; + } + + if (!is_last_col) { + uint32_t col_size = total_size - col_offset - 4; + mem_put_le32(dst + col_offset, col_size); + + // If it is not final packing, record the maximum tile column size we see, + // otherwise, check if the tile size is out of the range. + *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + } + } +#else +#if CONFIG_TILE_GROUPS + write_uncompressed_header(cpi, wb); + +#if CONFIG_EXT_REFS + if (cm->show_existing_frame) { + total_size = aom_wb_bytes_written(wb); + return (uint32_t)total_size; + } +#endif // CONFIG_EXT_REFS + + // Write the tile length code + tile_size_bytes_wb = *wb; + aom_wb_write_literal(wb, 3, 2); + + /* Write a placeholder for the number of tiles in each tile group */ + tg_params_wb = *wb; + saved_offset = wb->bit_offset; + if (have_tiles) { + aom_wb_overwrite_literal(wb, 3, n_log2_tiles); + aom_wb_overwrite_literal(wb, (1 << n_log2_tiles) - 1, n_log2_tiles); + } + + /* Write a placeholder for the compressed header length */ + comp_hdr_len_wb = *wb; + aom_wb_write_literal(wb, 0, 16); + + uncompressed_hdr_size = aom_wb_bytes_written(wb); + dst = wb->bit_buffer; + comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size); + aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16); + hdr_size = uncompressed_hdr_size + comp_hdr_size; + total_size += hdr_size; +#endif + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + const int is_last_row = (tile_row == tile_rows - 1); + av1_tile_set_row(&tile_info, cm, tile_row); + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; +#if CONFIG_PVQ || CONFIG_EC_ADAPT + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; +#endif + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + const int is_last_col = (tile_col == tile_cols - 1); + const int is_last_tile = is_last_col && is_last_row; +#if !CONFIG_TILE_GROUPS + (void)tile_idx; +#else + + if ((!mtu_size && tile_count > tg_size) || + (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) { + // New tile group + tg_count++; + // We've exceeded the packet size + if (tile_count > 1) { + /* The last tile exceeded the packet size. The tile group size + should therefore be tile_count-1. + Move the last tile and insert headers before it + */ + uint32_t old_total_size = total_size - tile_size - 4; + memmove(dst + old_total_size + hdr_size, dst + old_total_size, + (tile_size + 4) * sizeof(uint8_t)); + // Copy uncompressed header + memmove(dst + old_total_size, dst, + uncompressed_hdr_size * sizeof(uint8_t)); + // Write the number of tiles in the group into the last uncompressed + // header before the one we've just inserted + aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, n_log2_tiles); + // Update the pointer to the last TG params + tg_params_wb.bit_offset = saved_offset + 8 * old_total_size; + // Copy compressed header + memmove(dst + old_total_size + uncompressed_hdr_size, + dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t)); + total_size += hdr_size; + tile_count = 1; + curr_tg_data_size = hdr_size + tile_size + 4; + + } else { + // We exceeded the packet size in just one tile + // Copy uncompressed header + memmove(dst + total_size, dst, + uncompressed_hdr_size * sizeof(uint8_t)); + // Write the number of tiles in the group into the last uncompressed + // header + aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); + tg_params_wb.bit_offset = saved_offset + 8 * total_size; + // Copy compressed header + memmove(dst + total_size + uncompressed_hdr_size, + dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t)); + total_size += hdr_size; + tile_count = 0; + curr_tg_data_size = hdr_size; + } + } + tile_count++; +#endif + av1_tile_set_col(&tile_info, cm, tile_col); + +#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS + av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); +#endif + buf->data = dst + total_size; + + // The last tile does not have a header. + if (!is_last_tile) total_size += 4; + +#if CONFIG_EC_ADAPT + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; +#endif +#if CONFIG_PVQ + cpi->td.mb.pvq_q = &this_tile->pvq_q; + cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; +#endif // CONFIG_PVQ +#if CONFIG_ANS + buf_ans_write_init(buf_ans, dst + total_size); + write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); + assert(tok == tok_end); + aom_buf_ans_flush(buf_ans); + tile_size = buf_ans_write_end(buf_ans); +#else + aom_start_encode(&mode_bc, dst + total_size); + write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); +#if !CONFIG_LV_MAP + assert(tok == tok_end); +#endif // !CONFIG_LV_MAP + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; +#endif // CONFIG_ANS +#if CONFIG_PVQ + cpi->td.mb.pvq_q = NULL; +#endif + + assert(tile_size > 0); + +#if CONFIG_TILE_GROUPS + curr_tg_data_size += tile_size + 4; +#endif + buf->size = tile_size; + + if (!is_last_tile) { + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + // size of this tile + mem_put_le32(buf->data, tile_size); + } + + total_size += tile_size; + } + } +#if CONFIG_TILE_GROUPS + // Write the final tile group size + if (n_log2_tiles) { + aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); + } + // Remux if possible. TODO (Thomas Davies): do this for more than one tile + // group + if (have_tiles && tg_count == 1) { + int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size); + data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size, + data_size, *max_tile_size, *max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size = data_size + uncompressed_hdr_size + comp_hdr_size; + aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2); + } + +#endif +#endif // CONFIG_EXT_TILE + return (uint32_t)total_size; +} + +static void write_render_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + const int scaling_active = + cm->width != cm->render_width || cm->height != cm->render_height; + aom_wb_write_bit(wb, scaling_active); + if (scaling_active) { + aom_wb_write_literal(wb, cm->render_width - 1, 16); + aom_wb_write_literal(wb, cm->render_height - 1, 16); + } +} + +#if CONFIG_FRAME_SUPERRES +static void write_superres_scale(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + // This scaling and frame superres are probably incompatible + assert(cm->width == cm->render_width && cm->height == cm->render_height); + + // First bit is whether to to scale or not + if (cm->superres_scale_numerator == SUPERRES_SCALE_DENOMINATOR) { + aom_wb_write_bit(wb, 0); // no scaling + } else { + aom_wb_write_bit(wb, 1); // scaling, write scale factor + // TODO(afergs): write factor to the compressed header instead + aom_wb_write_literal( + wb, cm->superres_scale_numerator - SUPERRES_SCALE_NUMERATOR_MIN, + SUPERRES_SCALE_BITS); + } +} +#endif // CONFIG_FRAME_SUPERRES + +static void write_frame_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { +#if CONFIG_FRAME_SUPERRES + // If SUPERRES scaling is happening, write the full resolution instead of the + // downscaled resolution. The decoder will reduce this resolution itself. + if (cm->superres_scale_numerator != SUPERRES_SCALE_DENOMINATOR) { + aom_wb_write_literal(wb, cm->superres_width - 1, 16); + aom_wb_write_literal(wb, cm->superres_height - 1, 16); + } else { +#endif // CONFIG_FRAME_SUPERRES + aom_wb_write_literal(wb, cm->width - 1, 16); + aom_wb_write_literal(wb, cm->height - 1, 16); +#if CONFIG_FRAME_SUPERRES + } +#endif // CONFIG_FRAME_SUPERRES + + // TODO(afergs): Also write something different to render_size? + // When superres scales, they'll be almost guaranteed to be + // different on the other side. + write_render_size(cm, wb); +#if CONFIG_FRAME_SUPERRES + write_superres_scale(cm, wb); +#endif // CONFIG_FRAME_SUPERRES +} + +static void write_frame_size_with_refs(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int found = 0; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); + + if (cfg != NULL) { + found = + cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; + found &= cm->render_width == cfg->render_width && + cm->render_height == cfg->render_height; + } + aom_wb_write_bit(wb, found); + if (found) { + break; + } + } + + if (!found) { + write_frame_size(cm, wb); + } +} + +static void write_sync_code(struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, AV1_SYNC_CODE_0, 8); + aom_wb_write_literal(wb, AV1_SYNC_CODE_1, 8); + aom_wb_write_literal(wb, AV1_SYNC_CODE_2, 8); +} + +static void write_profile(BITSTREAM_PROFILE profile, + struct aom_write_bit_buffer *wb) { + switch (profile) { + case PROFILE_0: aom_wb_write_literal(wb, 0, 2); break; + case PROFILE_1: aom_wb_write_literal(wb, 2, 2); break; + case PROFILE_2: aom_wb_write_literal(wb, 1, 2); break; + case PROFILE_3: aom_wb_write_literal(wb, 6, 3); break; + default: assert(0); + } +} + +static void write_bitdepth_colorspace_sampling( + AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + if (cm->profile >= PROFILE_2) { + assert(cm->bit_depth > AOM_BITS_8); + aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1); + } + aom_wb_write_literal(wb, cm->color_space, 3); + if (cm->color_space != AOM_CS_SRGB) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, cm->color_range); + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); + aom_wb_write_bit(wb, cm->subsampling_x); + aom_wb_write_bit(wb, cm->subsampling_y); + aom_wb_write_bit(wb, 0); // unused + } else { + assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } + } else { + assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); + aom_wb_write_bit(wb, 0); // unused + } +} + +#if CONFIG_REFERENCE_BUFFER +void write_sequence_header(SequenceHeader *seq_params) { + /* Placeholder for actually writing to the bitstream */ + seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG; + seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; + seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; +} +#endif + +static void write_uncompressed_header(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + +#if CONFIG_REFERENCE_BUFFER + /* TODO: Move outside frame loop or inside key-frame branch */ + write_sequence_header(&cpi->seq_params); +#endif + + aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2); + + write_profile(cm->profile, wb); + +#if CONFIG_EXT_REFS + // NOTE: By default all coded frames to be used as a reference + cm->is_reference_frame = 1; + + if (cm->show_existing_frame) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; + + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a reconstructed frame", + frame_to_show); + } + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + +#if CONFIG_REFERENCE_BUFFER + if (cpi->seq_params.frame_id_numbers_present_flag) { + int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + /* Add a zero byte to prevent emulation of superframe marker */ + /* Same logic as when when terminating the entropy coder */ + /* Consider to have this logic only one place */ + aom_wb_write_literal(wb, 0, 8); + } +#endif + + return; + } else { +#endif // CONFIG_EXT_REFS + aom_wb_write_bit(wb, 0); // show_existing_frame +#if CONFIG_EXT_REFS + } +#endif // CONFIG_EXT_REFS + + aom_wb_write_bit(wb, cm->frame_type); + aom_wb_write_bit(wb, cm->show_frame); + aom_wb_write_bit(wb, cm->error_resilient_mode); + +#if CONFIG_REFERENCE_BUFFER + cm->invalid_delta_frame_id_minus1 = 0; + if (cpi->seq_params.frame_id_numbers_present_flag) { + int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } +#endif + +#if CONFIG_FRAME_SUPERRES + // TODO(afergs): Remove - this is just to stop superres from breaking + cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; +#endif // CONFIG_FRAME_SUPERRES + + if (cm->frame_type == KEY_FRAME) { + write_sync_code(wb); + write_bitdepth_colorspace_sampling(cm, wb); + write_frame_size(cm, wb); +#if CONFIG_ANS && ANS_MAX_SYMBOLS + assert(cpi->common.ans_window_size_log2 >= 8); + assert(cpi->common.ans_window_size_log2 < 24); + aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); +#endif // CONFIG_ANS && ANS_MAX_SYMBOLS +#if CONFIG_PALETTE + aom_wb_write_bit(wb, cm->allow_screen_content_tools); +#endif // CONFIG_PALETTE + } else { + if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only); +#if CONFIG_PALETTE + if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); +#endif // CONFIG_PALETTE + if (!cm->error_resilient_mode) { + if (cm->intra_only) { + aom_wb_write_bit(wb, + cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); + } else { + aom_wb_write_bit(wb, + cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE); + if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE) + aom_wb_write_bit(wb, + cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL); + } + } + +#if CONFIG_EXT_REFS + cpi->refresh_frame_mask = get_refresh_mask(cpi); +#endif // CONFIG_EXT_REFS + + if (cm->intra_only) { + write_sync_code(wb); + write_bitdepth_colorspace_sampling(cm, wb); + +#if CONFIG_EXT_REFS + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); +#else + aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); +#endif // CONFIG_EXT_REFS + write_frame_size(cm, wb); + +#if CONFIG_ANS && ANS_MAX_SYMBOLS + assert(cpi->common.ans_window_size_log2 >= 8); + assert(cpi->common.ans_window_size_log2 < 24); + aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); +#endif // CONFIG_ANS && ANS_MAX_SYMBOLS + } else { + MV_REFERENCE_FRAME ref_frame; + +#if CONFIG_EXT_REFS + aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); +#else + aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + if (!cpi->refresh_frame_mask) { + // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame + // will not be used as a reference + cm->is_reference_frame = 0; + } +#endif // CONFIG_EXT_REFS + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); + aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + REF_FRAMES_LOG2); + aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); +#if CONFIG_REFERENCE_BUFFER + if (cpi->seq_params.frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cpi, ref_frame); + int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7; + int diff_len = cpi->seq_params.delta_frame_id_length_minus2 + 2; + int delta_frame_id_minus1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus1 < 0 || + delta_frame_id_minus1 >= (1 << diff_len)) + cm->invalid_delta_frame_id_minus1 = 1; + aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len); + } +#endif + } + +#if CONFIG_FRAME_SIZE + if (cm->error_resilient_mode == 0) { + write_frame_size_with_refs(cpi, wb); + } else { + write_frame_size(cm, wb); + } +#else + write_frame_size_with_refs(cpi, wb); +#endif + + aom_wb_write_bit(wb, cm->allow_high_precision_mv); + + fix_interp_filter(cm, cpi->td.counts); + write_frame_interp_filter(cm->interp_filter, wb); +#if CONFIG_TEMPMV_SIGNALING + if (!cm->error_resilient_mode) { + aom_wb_write_bit(wb, cm->use_prev_frame_mvs); + } +#endif + } + } + +#if CONFIG_REFERENCE_BUFFER + cm->refresh_mask = cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi); +#endif + + if (!cm->error_resilient_mode) { + aom_wb_write_bit( + wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD); + } + + aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); + + assert(cm->mib_size == mi_size_wide[cm->sb_size]); + assert(cm->mib_size == 1 << cm->mib_size_log2); +#if CONFIG_EXT_PARTITION + assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0); +#else + assert(cm->sb_size == BLOCK_64X64); +#endif // CONFIG_EXT_PARTITION + + encode_loopfilter(cm, wb); +#if CONFIG_CDEF + encode_cdef(cm, wb); +#endif +#if CONFIG_LOOP_RESTORATION + encode_restoration_mode(cm, wb); +#endif // CONFIG_LOOP_RESTORATION + encode_quantization(cm, wb); + encode_segmentation(cm, xd, wb); +#if CONFIG_DELTA_Q + { + int i; + struct segmentation *const seg = &cm->seg; + int segment_quantizer_active = 0; + for (i = 0; i < MAX_SEGMENTS; i++) { + if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) { + segment_quantizer_active = 1; + } + } + + if (cm->delta_q_present_flag) + assert(segment_quantizer_active == 0 && cm->base_qindex > 0); + if (segment_quantizer_active == 0 && cm->base_qindex > 0) { + aom_wb_write_bit(wb, cm->delta_q_present_flag); + if (cm->delta_q_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2); + xd->prev_qindex = cm->base_qindex; +#if CONFIG_EXT_DELTA_Q + assert(seg->abs_delta == SEGMENT_DELTADATA); + aom_wb_write_bit(wb, cm->delta_lf_present_flag); + if (cm->delta_lf_present_flag) { + aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2); + xd->prev_delta_lf_from_base = 0; + } +#endif // CONFIG_EXT_DELTA_Q + } + } + } +#endif + + write_tx_mode(cm, xd, &cm->tx_mode, wb); + + if (cpi->allow_comp_inter_inter) { + const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; +#if !CONFIG_REF_ADAPT + const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; +#endif // !CONFIG_REF_ADAPT + + aom_wb_write_bit(wb, use_hybrid_pred); +#if !CONFIG_REF_ADAPT + if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); +#endif // !CONFIG_REF_ADAPT + } + +#if CONFIG_EXT_TX + aom_wb_write_bit(wb, cm->reduced_tx_set_used); +#endif // CONFIG_EXT_TX + + write_tile_info(cm, wb); +} + +#if CONFIG_GLOBAL_MOTION +static void write_global_motion_params(WarpedMotionParams *params, + WarpedMotionParams *ref_params, + aom_prob *probs, aom_writer *w, + int allow_hp) { + TransformationType type = params->wmtype; + int trans_bits; + int trans_prec_diff; + av1_write_token(w, av1_global_motion_types_tree, probs, + &global_motion_types_encodings[type]); + switch (type) { + case HOMOGRAPHY: + case HORTRAPEZOID: + case VERTRAPEZOID: + if (type != HORTRAPEZOID) + aom_write_signed_primitive_refsubexpfin( + w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), + (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); + if (type != VERTRAPEZOID) + aom_write_signed_primitive_refsubexpfin( + w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), + (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)); + // fallthrough intended + case AFFINE: + case ROTZOOM: + aom_write_signed_primitive_refsubexpfin( + w, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + if (type != VERTRAPEZOID) + aom_write_signed_primitive_refsubexpfin( + w, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (type >= AFFINE) { + if (type != HORTRAPEZOID) + aom_write_signed_primitive_refsubexpfin( + w, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_write_signed_primitive_refsubexpfin( + w, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)); + } + // fallthrough intended + case TRANSLATION: + trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (type == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_write_signed_primitive_refsubexpfin( + w, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_write_signed_primitive_refsubexpfin( + w, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); + break; + case IDENTITY: break; + default: assert(0); + } +} + +static void write_global_motion(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + int frame; + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { +#if !CONFIG_REF_MV + // With ref-mv, clearing unused global motion models here is + // unsafe, and we need to rely on the recode loop to do it + // instead. See av1_find_mv_refs for details. + if (!cpi->td.rd_counts.global_motion_used[frame]) { + set_default_warp_params(&cm->global_motion[frame]); + } +#endif + write_global_motion_params( + &cm->global_motion[frame], &cm->prev_frame->global_motion[frame], + cm->fc->global_motion_types_prob, w, cm->allow_high_precision_mv); + /* + printf("Frame %d/%d: Enc Ref %d (used %d): %d %d %d %d\n", + cm->current_video_frame, cm->show_frame, frame, + cpi->global_motion_used[frame], cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } +} +#endif + +static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { + AV1_COMMON *const cm = &cpi->common; +#if CONFIG_SUPERTX + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#endif // CONFIG_SUPERTX + FRAME_CONTEXT *const fc = cm->fc; + FRAME_COUNTS *counts = cpi->td.counts; + aom_writer *header_bc; + int i, j; + +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + +#if CONFIG_ANS + int header_size; + header_bc = &cpi->buf_ans; + buf_ans_write_init(header_bc, data); +#else + aom_writer real_header_bc; + header_bc = &real_header_bc; + aom_start_encode(header_bc, data); +#endif + +#if CONFIG_LOOP_RESTORATION + encode_restoration(cm, header_bc); +#endif // CONFIG_LOOP_RESTORATION +#if !CONFIG_EC_ADAPT + update_txfm_probs(cm, header_bc, counts); +#endif +#if CONFIG_LV_MAP + av1_write_txb_probs(cpi, header_bc); +#else +#if !CONFIG_PVQ +#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) + update_coef_probs(cpi, header_bc); +#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#endif // CONFIG_PVQ +#endif // CONFIG_LV_MAP + +#if CONFIG_VAR_TX + update_txfm_partition_probs(cm, header_bc, counts, probwt); +#endif + + update_skip_probs(cm, header_bc, counts); +#if !CONFIG_EC_ADAPT && CONFIG_DELTA_Q + update_delta_q_probs(cm, header_bc, counts); +#if CONFIG_EXT_DELTA_Q + update_delta_lf_probs(cm, header_bc, counts); +#endif +#endif +#if !CONFIG_EC_ADAPT + update_seg_probs(cpi, header_bc); + + for (i = 0; i < INTRA_MODES; ++i) { + prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i], + counts->uv_mode[i], INTRA_MODES, probwt, header_bc); + } + +#if CONFIG_EXT_PARTITION_TYPES + for (i = 0; i < PARTITION_PLOFFSET; ++i) + prob_diff_update(av1_partition_tree, fc->partition_prob[i], + counts->partition[i], PARTITION_TYPES, probwt, header_bc); + for (; i < PARTITION_CONTEXTS_PRIMARY; ++i) + prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i], + counts->partition[i], EXT_PARTITION_TYPES, probwt, + header_bc); +#else + for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) + prob_diff_update(av1_partition_tree, fc->partition_prob[i], + counts->partition[i], PARTITION_TYPES, probwt, header_bc); +#endif // CONFIG_EXT_PARTITION_TYPES +#if CONFIG_UNPOISON_PARTITION_CTX + for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { + unsigned int ct[2] = { counts->partition[i][PARTITION_VERT], + counts->partition[i][PARTITION_SPLIT] }; + assert(counts->partition[i][PARTITION_NONE] == 0); + assert(counts->partition[i][PARTITION_HORZ] == 0); + assert(fc->partition_prob[i][PARTITION_NONE] == 0); + assert(fc->partition_prob[i][PARTITION_HORZ] == 0); + av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_VERT], + ct, probwt); + } + for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { + unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ], + counts->partition[i][PARTITION_SPLIT] }; + assert(counts->partition[i][PARTITION_NONE] == 0); + assert(counts->partition[i][PARTITION_VERT] == 0); + assert(fc->partition_prob[i][PARTITION_NONE] == 0); + assert(fc->partition_prob[i][PARTITION_VERT] == 0); + av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_HORZ], + ct, probwt); + } +#endif +#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP + for (i = 0; i < INTRA_FILTERS + 1; ++i) + prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i], + counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc); +#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP +#endif // !CONFIG_EC_ADAPT + + if (frame_is_intra_only(cm)) { + av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob); +#if CONFIG_EC_MULTISYMBOL + av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf); +#endif + +#if !CONFIG_EC_ADAPT + for (i = 0; i < INTRA_MODES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j], + counts->kf_y_mode[i][j], INTRA_MODES, probwt, + header_bc); +#endif // CONFIG_EC_ADAPT + } else { +#if CONFIG_REF_MV + update_inter_mode_probs(cm, header_bc, counts); +#else +#if !CONFIG_EC_ADAPT + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i], + counts->inter_mode[i], INTER_MODES, probwt, header_bc); + } +#endif +#endif +#if CONFIG_EXT_INTER + update_inter_compound_mode_probs(cm, probwt, header_bc); + + if (cm->reference_mode != COMPOUND_REFERENCE) { + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { + if (is_interintra_allowed_bsize_group(i)) { + av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i], + cm->counts.interintra[i], probwt); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { + prob_diff_update( + av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i], + counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc); + } + for (i = 0; i < BLOCK_SIZES; i++) { + if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) + av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i], + cm->counts.wedge_interintra[i], probwt); + } + } +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + if (cm->reference_mode != SINGLE_REFERENCE) { + for (i = 0; i < BLOCK_SIZES; i++) + prob_diff_update(av1_compound_type_tree, fc->compound_type_prob[i], + cm->counts.compound_interinter[i], COMPOUND_TYPES, + probwt, header_bc); + } +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) + prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i], + counts->motion_mode[i], MOTION_MODES, probwt, header_bc); +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if !CONFIG_EC_ADAPT + if (cm->interp_filter == SWITCHABLE) + update_switchable_interp_probs(cm, header_bc, counts); +#endif + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i], + counts->intra_inter[i], probwt); + + if (cpi->allow_comp_inter_inter) { + const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; + if (use_hybrid_pred) + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i], + counts->comp_inter[i], probwt); + } + + if (cm->reference_mode != COMPOUND_REFERENCE) { + for (i = 0; i < REF_CONTEXTS; i++) { + for (j = 0; j < (SINGLE_REFS - 1); j++) { + av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j], + counts->single_ref[i][j], probwt); + } + } + } + if (cm->reference_mode != SINGLE_REFERENCE) { + for (i = 0; i < REF_CONTEXTS; i++) { +#if CONFIG_EXT_REFS + for (j = 0; j < (FWD_REFS - 1); j++) { + av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], + counts->comp_ref[i][j], probwt); + } + for (j = 0; j < (BWD_REFS - 1); j++) { + av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j], + counts->comp_bwdref[i][j], probwt); + } +#else + for (j = 0; j < (COMP_REFS - 1); j++) { + av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j], + counts->comp_ref[i][j], probwt); + } +#endif // CONFIG_EXT_REFS + } + } + +#if !CONFIG_EC_ADAPT + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i], + counts->y_mode[i], INTRA_MODES, probwt, header_bc); + } +#endif + + av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, +#if CONFIG_REF_MV + counts->mv); +#else + &counts->mv); +#endif +#if !CONFIG_EC_ADAPT + update_ext_tx_probs(cm, header_bc); +#endif +#if CONFIG_SUPERTX + if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc); +#endif // CONFIG_SUPERTX +#if CONFIG_GLOBAL_MOTION + write_global_motion(cpi, header_bc); +#endif // CONFIG_GLOBAL_MOTION + } +#if CONFIG_EC_MULTISYMBOL +#if !CONFIG_EC_ADAPT +#if CONFIG_NEW_TOKENSET + av1_coef_head_cdfs(fc); +#endif + av1_coef_pareto_cdfs(fc); +#if CONFIG_REF_MV + for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]); +#else + av1_set_mv_cdfs(&fc->nmvc); +#endif +#if CONFIG_EC_MULTISYMBOL + av1_set_mode_cdfs(cm); +#endif +#endif // !CONFIG_EC_ADAPT +#endif +#if CONFIG_ANS + aom_buf_ans_flush(header_bc); + header_size = buf_ans_write_end(header_bc); + assert(header_size <= 0xffff); + return header_size; +#else + aom_stop_encode(header_bc); + assert(header_bc->pos <= 0xffff); + return header_bc->pos; +#endif // CONFIG_ANS +} + +static int choose_size_bytes(uint32_t size, int spare_msbs) { + // Choose the number of bytes required to represent size, without + // using the 'spare_msbs' number of most significant bits. + + // Make sure we will fit in 4 bytes to start with.. + if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; + + // Normalise to 32 bits + size <<= spare_msbs; + + if (size >> 24 != 0) + return 4; + else if (size >> 16 != 0) + return 3; + else if (size >> 8 != 0) + return 2; + else + return 1; +} + +static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) { + switch (sz) { + case 1: dst[0] = (uint8_t)(val & 0xff); break; + case 2: mem_put_le16(dst, val); break; + case 3: mem_put_le24(dst, val); break; + case 4: mem_put_le32(dst, val); break; + default: assert(0 && "Invalid size"); break; + } +} +static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes) { +// Choose the tile size bytes (tsb) and tile column size bytes (tcsb) +#if CONFIG_EXT_TILE + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + const int tsb = choose_size_bytes(max_tile_size, 1); + const int tcsb = choose_size_bytes(max_tile_col_size, 0); +#else + const int tsb = choose_size_bytes(max_tile_size, 0); + const int tcsb = 4; // This is ignored + (void)max_tile_col_size; +#endif // CONFIG_EXT_TILE + + assert(tsb > 0); + assert(tcsb > 0); + + *tile_size_bytes = tsb; + *tile_col_size_bytes = tcsb; + + if (tsb == 4 && tcsb == 4) { + return data_size; + } else { + uint32_t wpos = 0; + uint32_t rpos = 0; + +#if CONFIG_EXT_TILE + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + // All but the last column has a column header + if (tile_col < cm->tile_cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * cm->tile_rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } + + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + // All, including the last row has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. + if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } + } +#else + const int n_tiles = cm->tile_cols * cm->tile_rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } +#endif // CONFIG_EXT_TILE + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; + } +} + +void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { + uint8_t *data = dst; +#if !CONFIG_TILE_GROUPS + uint32_t compressed_header_size; + uint32_t uncompressed_header_size; + struct aom_write_bit_buffer saved_wb; +#endif + uint32_t data_size; + struct aom_write_bit_buffer wb = { data, 0 }; + + unsigned int max_tile_size; + unsigned int max_tile_col_size; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + +#if !CONFIG_TILE_GROUPS + int tile_size_bytes; + int tile_col_size_bytes; + AV1_COMMON *const cm = &cpi->common; + const int have_tiles = cm->tile_cols * cm->tile_rows > 1; + + // Write the uncompressed header + write_uncompressed_header(cpi, &wb); + +#if CONFIG_EXT_REFS + if (cm->show_existing_frame) { + *size = aom_wb_bytes_written(&wb); + return; + } +#endif // CONFIG_EXT_REFS + + // We do not know these in advance. Output placeholder bit. + saved_wb = wb; + // Write tile size magnitudes + if (have_tiles) { +// Note that the last item in the uncompressed header is the data +// describing tile configuration. +#if CONFIG_EXT_TILE + // Number of bytes in tile column size - 1 + aom_wb_write_literal(&wb, 0, 2); +#endif // CONFIG_EXT_TILE + // Number of bytes in tile size - 1 + aom_wb_write_literal(&wb, 0, 2); + } + // Size of compressed header + aom_wb_write_literal(&wb, 0, 16); + + uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb); + data += uncompressed_header_size; + + aom_clear_system_state(); + + // Write the compressed header + compressed_header_size = write_compressed_header(cpi, data); + data += compressed_header_size; + + // Write the encoded tile data + data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); +#else + data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size); +#endif +#if !CONFIG_TILE_GROUPS + if (have_tiles) { + data_size = + remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + } + + data += data_size; + + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { +#if CONFIG_EXT_TILE + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2); +#endif // CONFIG_EXT_TILE + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2); + } + // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits. + assert(compressed_header_size <= 0xffff); + aom_wb_write_literal(&saved_wb, compressed_header_size, 16); +#else + data += data_size; +#endif +#if CONFIG_ANS && ANS_REVERSE + // Avoid aliasing the superframe index + *data++ = 0; +#endif + *size = data - dst; +} diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h new file mode 100644 index 000000000..c75d80891 --- /dev/null +++ b/third_party/aom/av1/encoder/bitstream.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_BITSTREAM_H_ +#define AV1_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +#if CONFIG_REFERENCE_BUFFER +void write_sequence_header(SequenceHeader *seq_params); +#endif + +void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); + +void av1_encode_token_init(void); + +static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { +#if CONFIG_EXT_REFS + // Do not swap gf and arf indices for internal overlay frames + return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref && + !cpi->rc.is_src_frame_ext_arf; +#else + return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && + cpi->rc.is_src_frame_alt_ref; +#endif // CONFIG_EXT_REFS +} + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, +#if CONFIG_SUPERTX + const int supertx_enabled, +#endif +#if CONFIG_TXK_SEL + int block, int plane, +#endif + aom_writer *w); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_BITSTREAM_H_ diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h new file mode 100644 index 000000000..39e08d5b4 --- /dev/null +++ b/third_party/aom/av1/encoder/block.h @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_BLOCK_H_ +#define AV1_ENCODER_BLOCK_H_ + +#include "av1/common/entropymv.h" +#include "av1/common/entropy.h" +#if CONFIG_PVQ +#include "av1/encoder/encint.h" +#endif +#if CONFIG_REF_MV +#include "av1/common/mvref_common.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_PVQ +// Maximum possible # of tx blocks in luma plane, which is currently 256, +// since there can be 16x16 of 4x4 tx. +#define MAX_PVQ_BLOCKS_IN_SB (MAX_SB_SQUARE >> 2 * OD_LOG_BSIZE0) +#endif + +typedef struct { + unsigned int sse; + int sum; + unsigned int var; +} DIFF; + +typedef struct macroblock_plane { + DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]); +#if CONFIG_PVQ + DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]); +#endif + tran_low_t *qcoeff; + tran_low_t *coeff; + uint16_t *eobs; +#if CONFIG_LV_MAP + uint8_t *txb_entropy_ctx; +#endif + struct buf_2d src; + + // Quantizer setings + const int16_t *quant_fp; + const int16_t *round_fp; + const int16_t *quant; + const int16_t *quant_shift; + const int16_t *zbin; + const int16_t *round; +#if CONFIG_NEW_QUANT + const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES]; +#endif // CONFIG_NEW_QUANT +} MACROBLOCK_PLANE; + +/* The [2] dimension is for whether we skip the EOB node (i.e. if previous + * coefficient in this block was zero) or not. */ +typedef unsigned int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2] + [COEFF_CONTEXTS][ENTROPY_TOKENS]; + +typedef struct { + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + int16_t mode_context[MODE_CTX_REF_FRAMES]; +#if CONFIG_LV_MAP + // TODO(angiebird): Reduce the buffer size according to sb_type + tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]; + uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + uint8_t txb_skip_ctx[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + int dc_sign_ctx[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; +#endif +#if CONFIG_REF_MV + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; + CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; +#if CONFIG_EXT_INTER + int16_t compound_mode_context[MODE_CTX_REF_FRAMES]; +#endif // CONFIG_EXT_INTER +#endif +} MB_MODE_INFO_EXT; + +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} MvLimits; + +#if CONFIG_PALETTE +typedef struct { + uint8_t best_palette_color_map[MAX_SB_SQUARE]; + float kmeans_data_buf[2 * MAX_SB_SQUARE]; +} PALETTE_BUFFER; +#endif // CONFIG_PALETTE + +typedef struct macroblock MACROBLOCK; +struct macroblock { + struct macroblock_plane plane[MAX_MB_PLANE]; + + MACROBLOCKD e_mbd; + MB_MODE_INFO_EXT *mbmi_ext; + int skip_block; + int qindex; + + // The equivalent error at the current rdmult of one whole bit (not one + // bitcost unit). + int errorperbit; + // The equivalend SAD error of one (whole) bit at the current quantizer + // for large blocks. + int sadperbit16; + // The equivalend SAD error of one (whole) bit at the current quantizer + // for sub-8x8 blocks. + int sadperbit4; + int rddiv; + int rdmult; + int mb_energy; + int *m_search_count_ptr; + int *ex_search_count_ptr; + +#if CONFIG_VAR_TX + unsigned int txb_split_count; +#endif + + // These are set to their default values at the beginning, and then adjusted + // further in the encoding process. + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; + + int mv_best_ref_index[TOTAL_REFS_PER_FRAME]; + unsigned int max_mv_context[TOTAL_REFS_PER_FRAME]; + unsigned int source_variance; + unsigned int pred_sse[TOTAL_REFS_PER_FRAME]; + int pred_mv_sad[TOTAL_REFS_PER_FRAME]; + +#if CONFIG_REF_MV + int *nmvjointcost; + int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; + int *nmvcost[NMV_CONTEXTS][2]; + int *nmvcost_hp[NMV_CONTEXTS][2]; + int **mv_cost_stack[NMV_CONTEXTS]; + int *nmvjointsadcost; +#else + int nmvjointcost[MV_JOINTS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int nmvjointsadcost[MV_JOINTS]; +#endif + + int **mvcost; + int *nmvsadcost[2]; + int *nmvsadcost_hp[2]; + int **mvsadcost; +#if CONFIG_MOTION_VAR + int32_t *wsrc_buf; + int32_t *mask_buf; +#endif // CONFIG_MOTION_VAR + +#if CONFIG_PALETTE + PALETTE_BUFFER *palette_buffer; +#endif // CONFIG_PALETTE + + // These define limits to motion vector components to prevent them + // from extending outside the UMV borders + MvLimits mv_limits; + +#if CONFIG_VAR_TX + uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; +#if CONFIG_REF_MV + uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; +#endif +#endif + + int skip; + +#if CONFIG_CB4X4 + int skip_chroma_rd; +#endif + + // note that token_costs is the cost when eob node is skipped + av1_coeff_cost token_costs[TX_SIZES]; + + int optimize; + + // Used to store sub partition's choices. + MV pred_mv[TOTAL_REFS_PER_FRAME]; + + // Store the best motion vector during motion search + int_mv best_mv; + // Store the second best motion vector during full-pixel motion search + int_mv second_best_mv; + + // use default transform and skip transform type search for intra modes + int use_default_intra_tx_type; + // use default transform and skip transform type search for inter modes + int use_default_inter_tx_type; +#if CONFIG_PVQ + int rate; + // 1 if neither AC nor DC is coded. Only used during RDO. + int pvq_skip[MAX_MB_PLANE]; + PVQ_QUEUE *pvq_q; + + // Storage for PVQ tx block encodings in a superblock. + // There can be max 16x16 of 4x4 blocks (and YUV) encode by PVQ + // 256 is the max # of 4x4 blocks in a SB (64x64), which comes from: + // 1) Since PVQ is applied to each trasnform-ed block + // 2) 4x4 is the smallest tx size in AV1 + // 3) AV1 allows using smaller tx size than block (i.e. partition) size + // TODO(yushin) : The memory usage could be improved a lot, since this has + // storage for 10 bands and 128 coefficients for every 4x4 block, + PVQ_INFO pvq[MAX_PVQ_BLOCKS_IN_SB][MAX_MB_PLANE]; + daala_enc_ctx daala_enc; + int pvq_speed; + int pvq_coded; // Indicates whether pvq_info needs be stored to tokenize +#endif +#if CONFIG_DAALA_DIST + // Keep rate of each 4x4 block in the current macroblock during RDO + // This is needed when using the 8x8 Daala distortion metric during RDO, + // because it evaluates distortion in a different order than the underlying + // 4x4 blocks are coded. + int rate_4x4[256]; +#endif +#if CONFIG_CFL + // Whether luma needs to be stored during RDO. + int cfl_store_y; +#endif +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_BLOCK_H_ diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c new file mode 100644 index 000000000..113ceb29d --- /dev/null +++ b/third_party/aom/av1/encoder/blockiness.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = abs(blockiness from reconstructed buffer - +// blockiness from source buffer,0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value +// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame currently by +// looking at all borders in steps of 4. +double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, int width, + int height) { + double blockiness = 0; + int i, j; + aom_clear_system_state(); + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4) { + if (i > 0 && i < height && j > 0 && j < width) { + blockiness += + blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); + blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, + img2_pitch, 4); + } + } + } + blockiness /= width * height / 16; + return blockiness; +} diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c new file mode 100644 index 000000000..4c7d6ff00 --- /dev/null +++ b/third_party/aom/av1/encoder/context_tree.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" + +static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { +#if CONFIG_CB4X4 + BLOCK_4X4, +#endif + BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, +#if CONFIG_EXT_PARTITION + BLOCK_128X128, +#endif // CONFIG_EXT_PARTITION +}; + +static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_TYPE partition, +#endif + PICK_MODE_CONTEXT *ctx) { + const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk); + const int num_pix = num_blk * tx_size_2d[0]; + int i; +#if CONFIG_CB4X4 && CONFIG_VAR_TX + ctx->num_4x4_blk = num_blk / 4; +#else + ctx->num_4x4_blk = num_blk; +#endif + +#if CONFIG_EXT_PARTITION_TYPES + ctx->partition = partition; +#endif + + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_VAR_TX + CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t))); +#endif + CHECK_MEM_ERROR(cm, ctx->coeff[i], + aom_memalign(32, num_pix * sizeof(*ctx->coeff[i]))); + CHECK_MEM_ERROR(cm, ctx->qcoeff[i], + aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i]))); + CHECK_MEM_ERROR(cm, ctx->dqcoeff[i], + aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i]))); + CHECK_MEM_ERROR(cm, ctx->eobs[i], + aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); +#if CONFIG_LV_MAP + CHECK_MEM_ERROR( + cm, ctx->txb_entropy_ctx[i], + aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); +#endif + +#if CONFIG_PVQ + CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i], + aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i]))); +#endif + } + +#if CONFIG_PALETTE + if (cm->allow_screen_content_tools) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } + } +#endif // CONFIG_PALETTE +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_VAR_TX + aom_free(ctx->blk_skip[i]); + ctx->blk_skip[i] = 0; +#endif + aom_free(ctx->coeff[i]); + ctx->coeff[i] = 0; + aom_free(ctx->qcoeff[i]); + ctx->qcoeff[i] = 0; + aom_free(ctx->dqcoeff[i]); + ctx->dqcoeff[i] = 0; +#if CONFIG_PVQ + aom_free(ctx->pvq_ref_coeff[i]); + ctx->pvq_ref_coeff[i] = 0; +#endif + aom_free(ctx->eobs[i]); + ctx->eobs[i] = 0; +#if CONFIG_LV_MAP + aom_free(ctx->txb_entropy_ctx[i]); + ctx->txb_entropy_ctx[i] = 0; +#endif + } + +#if CONFIG_PALETTE + for (i = 0; i < 2; ++i) { + aom_free(ctx->color_index_map[i]); + ctx->color_index_map[i] = 0; + } +#endif // CONFIG_PALETTE +} + +static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, + int num_4x4_blk) { +#if CONFIG_EXT_PARTITION_TYPES + alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[0]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[0]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->horizontal[1]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[1]); + + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A, + &tree->horizontala[0]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A, + &tree->horizontala[1]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_A, + &tree->horizontala[2]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_B, + &tree->horizontalb[0]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B, + &tree->horizontalb[1]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B, + &tree->horizontalb[2]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A, + &tree->verticala[0]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A, + &tree->verticala[1]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_A, + &tree->verticala[2]); + alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_B, + &tree->verticalb[0]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B, + &tree->verticalb[1]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B, + &tree->verticalb[2]); +#ifdef CONFIG_SUPERTX + alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ, + &tree->horizontal_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A, + &tree->horizontala_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B, + &tree->horizontalb_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A, + &tree->verticala_supertx); + alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B, + &tree->verticalb_supertx); +#endif // CONFIG_SUPERTX +#else + alloc_mode_context(cm, num_4x4_blk, &tree->none); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]); +#ifdef CONFIG_SUPERTX + alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx); + alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx); + alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx); +#endif + + if (num_4x4_blk > 4) { + alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]); + alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]); + } else { + memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1])); + memset(&tree->vertical[1], 0, sizeof(tree->vertical[1])); + } +#endif // CONFIG_EXT_PARTITION_TYPES +} + +static void free_tree_contexts(PC_TREE *tree) { +#if CONFIG_EXT_PARTITION_TYPES + int i; + for (i = 0; i < 3; i++) { + free_mode_context(&tree->horizontala[i]); + free_mode_context(&tree->horizontalb[i]); + free_mode_context(&tree->verticala[i]); + free_mode_context(&tree->verticalb[i]); + } +#endif // CONFIG_EXT_PARTITION_TYPES + free_mode_context(&tree->none); + free_mode_context(&tree->horizontal[0]); + free_mode_context(&tree->horizontal[1]); + free_mode_context(&tree->vertical[0]); + free_mode_context(&tree->vertical[1]); +#ifdef CONFIG_SUPERTX + free_mode_context(&tree->horizontal_supertx); + free_mode_context(&tree->vertical_supertx); + free_mode_context(&tree->split_supertx); +#if CONFIG_EXT_PARTITION_TYPES + free_mode_context(&tree->horizontala_supertx); + free_mode_context(&tree->horizontalb_supertx); + free_mode_context(&tree->verticala_supertx); + free_mode_context(&tree->verticalb_supertx); +#endif // CONFIG_EXT_PARTITION_TYPES +#endif // CONFIG_SUPERTX +} + +// This function sets up a tree of contexts such that at each square +// partition level. There are contexts for none, horizontal, vertical, and +// split. Along with a block_size value and a selected block_size which +// represents the state of our search. +void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) { + int i, j; +// TODO(jingning): The pc_tree allocation is redundant. We can take out all +// the leaf nodes after cb4x4 mode is enabled. +#if CONFIG_CB4X4 +#if CONFIG_EXT_PARTITION + const int tree_nodes_inc = 1024; +#else + const int tree_nodes_inc = 256; +#endif // CONFIG_EXT_PARTITION + const int leaf_factor = 4; +#else + const int tree_nodes_inc = 0; + const int leaf_factor = 1; +#endif +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256 * leaf_factor; + const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; +#else + const int leaf_nodes = 64 * leaf_factor; + const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; +#endif // CONFIG_EXT_PARTITION + int pc_tree_index = 0; + PC_TREE *this_pc; + PICK_MODE_CONTEXT *this_leaf; + int square_index = 1; + int nodes; + + aom_free(td->leaf_tree); + CHECK_MEM_ERROR(cm, td->leaf_tree, + aom_calloc(leaf_nodes, sizeof(*td->leaf_tree))); + aom_free(td->pc_tree); + CHECK_MEM_ERROR(cm, td->pc_tree, + aom_calloc(tree_nodes, sizeof(*td->pc_tree))); + + this_pc = &td->pc_tree[0]; + this_leaf = &td->leaf_tree[0]; + + // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same + // context so we only need to allocate 1 for each 8x8 block. + for (i = 0; i < leaf_nodes; ++i) { +#if CONFIG_EXT_PARTITION_TYPES + alloc_mode_context(cm, 4, PARTITION_NONE, &td->leaf_tree[i]); +#else + alloc_mode_context(cm, 16, &td->leaf_tree[i]); +#endif + } + + // Sets up all the leaf nodes in the tree. + for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + tree->block_size = square[0]; +#if CONFIG_CB4X4 + alloc_tree_contexts(cm, tree, 16); +#else + alloc_tree_contexts(cm, tree, 4); +#endif + tree->leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; +#if CONFIG_CB4X4 + alloc_tree_contexts(cm, tree, 16 << (2 * square_index)); +#else + alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); +#endif + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + + // Set up the root node for the largest superblock size + i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2; + td->pc_root[i] = &td->pc_tree[tree_nodes - 1]; + td->pc_root[i]->none.best_mode_index = 2; + // Set up the root nodes for the rest of the possible superblock sizes + while (--i >= 0) { + td->pc_root[i] = td->pc_root[i + 1]->split[0]; + td->pc_root[i]->none.best_mode_index = 2; + } +} + +void av1_free_pc_tree(ThreadData *td) { +#if CONFIG_CB4X4 +#if CONFIG_EXT_PARTITION + const int tree_nodes_inc = 1024; +#else + const int tree_nodes_inc = 256; +#endif // CONFIG_EXT_PARTITION + const int leaf_factor = 4; +#else + const int tree_nodes_inc = 0; + const int leaf_factor = 1; +#endif + +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256 * leaf_factor; + const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1; +#else + const int leaf_nodes = 64 * leaf_factor; + const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1; +#endif // CONFIG_EXT_PARTITION + int i; + + // Set up all 4x4 mode contexts + for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]); + + // Sets up all the leaf nodes in the tree. + for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + + aom_free(td->pc_tree); + td->pc_tree = NULL; + aom_free(td->leaf_tree); + td->leaf_tree = NULL; +} diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h new file mode 100644 index 000000000..67954126c --- /dev/null +++ b/third_party/aom/av1/encoder/context_tree.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_CONTEXT_TREE_H_ +#define AV1_ENCODER_CONTEXT_TREE_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct AV1Common; +struct ThreadData; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MODE_INFO mic; + MB_MODE_INFO_EXT mbmi_ext; +#if CONFIG_PALETTE + uint8_t *color_index_map[2]; +#endif // CONFIG_PALETTE +#if CONFIG_VAR_TX + uint8_t *blk_skip[MAX_MB_PLANE]; +#endif + + // dual buffer pointers, 0: in use, 1: best in store + tran_low_t *coeff[MAX_MB_PLANE]; + tran_low_t *qcoeff[MAX_MB_PLANE]; + tran_low_t *dqcoeff[MAX_MB_PLANE]; +#if CONFIG_PVQ + tran_low_t *pvq_ref_coeff[MAX_MB_PLANE]; +#endif + uint16_t *eobs[MAX_MB_PLANE]; +#if CONFIG_LV_MAP + uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; +#endif + + int num_4x4_blk; + int skip; + int pred_pixel_ready; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 0. + int skippable; + int best_mode_index; + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + + // TODO(jingning) Use RD_COST struct here instead. This involves a boarder + // scope of refactoring. + int rate; + int64_t dist; + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[TOTAL_REFS_PER_FRAME]; + InterpFilter pred_interp_filter; +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_TYPE partition; +#endif +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + int index; + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; +#if CONFIG_EXT_PARTITION_TYPES + PICK_MODE_CONTEXT horizontala[3]; + PICK_MODE_CONTEXT horizontalb[3]; + PICK_MODE_CONTEXT verticala[3]; + PICK_MODE_CONTEXT verticalb[3]; +#endif + union { + struct PC_TREE *split[4]; + PICK_MODE_CONTEXT *leaf_split[4]; + }; +#ifdef CONFIG_SUPERTX + PICK_MODE_CONTEXT horizontal_supertx; + PICK_MODE_CONTEXT vertical_supertx; + PICK_MODE_CONTEXT split_supertx; +#if CONFIG_EXT_PARTITION_TYPES + PICK_MODE_CONTEXT horizontala_supertx; + PICK_MODE_CONTEXT horizontalb_supertx; + PICK_MODE_CONTEXT verticala_supertx; + PICK_MODE_CONTEXT verticalb_supertx; +#endif +#endif +} PC_TREE; + +void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td); +void av1_free_pc_tree(struct ThreadData *td); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* AV1_ENCODER_CONTEXT_TREE_H_ */ diff --git a/third_party/aom/av1/encoder/corner_detect.c b/third_party/aom/av1/encoder/corner_detect.c new file mode 100644 index 000000000..e4c59dd9c --- /dev/null +++ b/third_party/aom/av1/encoder/corner_detect.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "third_party/fastfeat/fast.h" + +#include "av1/encoder/corner_detect.h" + +// Fast_9 wrapper +#define FAST_BARRIER 18 +int fast_corner_detect(unsigned char *buf, int width, int height, int stride, + int *points, int max_points) { + int num_points; + xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride, + FAST_BARRIER, &num_points); + num_points = (num_points <= max_points ? num_points : max_points); + if (num_points > 0 && frm_corners_xy) { + memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points); + free(frm_corners_xy); + return num_points; + } + free(frm_corners_xy); + return 0; +} diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h new file mode 100644 index 000000000..0317db5b3 --- /dev/null +++ b/third_party/aom/av1/encoder/corner_detect.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_CORNER_DETECT_H_ +#define AV1_ENCODER_CORNER_DETECT_H_ + +#include +#include +#include + +int fast_corner_detect(unsigned char *buf, int width, int height, int stride, + int *points, int max_points); + +#endif // AV1_ENCODER_CORNER_DETECT_H_ diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c new file mode 100644 index 000000000..64ee0c5ae --- /dev/null +++ b/third_party/aom/av1/encoder/corner_match.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "av1/encoder/corner_match.h" + +#define MATCH_SZ 13 +#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) +#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) +#define SEARCH_SZ 9 +#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) + +#define THRESHOLD_NCC 0.75 + +/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im, + centered at (x, y). +*/ +static double compute_variance(unsigned char *im, int stride, int x, int y) { + int sum = 0.0; + int sumsq = 0.0; + int var; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * + im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + } + var = sumsq * MATCH_SZ_SQ - sum * sum; + return (double)var; +} + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. +*/ +static double compute_cross_correlation(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int v1, v2; + int sum1 = 0; + int sum2 = 0; + int sumsq2 = 0; + int cross = 0; + int var2, cov; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; + v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + sum1 += v1; + sum2 += v2; + sumsq2 += v2 * v2; + cross += v1 * v2; + } + var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); +} + +static int is_eligible_point(int pointx, int pointy, int width, int height) { + return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && + pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); +} + +static int is_eligible_distance(int point1x, int point1y, int point2x, + int point2y, int width, int height) { + const int thresh = (width < height ? height : width) >> 4; + return ((point1x - point2x) * (point1x - point2x) + + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; +} + +static void improve_correspondence(unsigned char *frm, unsigned char *ref, + int width, int height, int frm_stride, + int ref_stride, + Correspondence *correspondences, + int num_correspondences) { + int i; + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + if (!is_eligible_distance(correspondences[i].x, correspondences[i].y, + correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + match_ncc = compute_cross_correlation( + frm, frm_stride, correspondences[i].x, correspondences[i].y, ref, + ref_stride, correspondences[i].rx + x, correspondences[i].ry + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + } + correspondences[i].rx += best_x; + correspondences[i].ry += best_y; + } + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].x + x, + correspondences[i].y + y, width, height)) + continue; + if (!is_eligible_distance( + correspondences[i].x + x, correspondences[i].y + y, + correspondences[i].rx, correspondences[i].ry, width, height)) + continue; + match_ncc = compute_cross_correlation( + ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm, + frm_stride, correspondences[i].x + x, correspondences[i].y + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + correspondences[i].x += best_x; + correspondences[i].y += best_y; + } +} + +int determine_correspondence(unsigned char *frm, int *frm_corners, + int num_frm_corners, unsigned char *ref, + int *ref_corners, int num_ref_corners, int width, + int height, int frm_stride, int ref_stride, + int *correspondence_pts) { + // TODO(sarahparker) Improve this to include 2-way match + int i, j; + Correspondence *correspondences = (Correspondence *)correspondence_pts; + int num_correspondences = 0; + for (i = 0; i < num_frm_corners; ++i) { + double best_match_ncc = 0.0; + double template_norm; + int best_match_j = -1; + if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width, + height)) + continue; + for (j = 0; j < num_ref_corners; ++j) { + double match_ncc; + if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, + height)) + continue; + if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1], + ref_corners[2 * j], ref_corners[2 * j + 1], + width, height)) + continue; + match_ncc = compute_cross_correlation( + frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref, + ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_match_j = j; + } + } + // Note: We want to test if the best correlation is >= THRESHOLD_NCC, + // but need to account for the normalization in compute_cross_correlation. + template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i], + frm_corners[2 * i + 1]); + if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { + correspondences[num_correspondences].x = frm_corners[2 * i]; + correspondences[num_correspondences].y = frm_corners[2 * i + 1]; + correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; + correspondences[num_correspondences].ry = + ref_corners[2 * best_match_j + 1]; + num_correspondences++; + } + } + improve_correspondence(frm, ref, width, height, frm_stride, ref_stride, + correspondences, num_correspondences); + return num_correspondences; +} diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h new file mode 100644 index 000000000..c0458642c --- /dev/null +++ b/third_party/aom/av1/encoder/corner_match.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_ENCODER_CORNER_MATCH_H_ +#define AV1_ENCODER_CORNER_MATCH_H_ + +#include +#include +#include + +typedef struct { + int x, y; + int rx, ry; +} Correspondence; + +int determine_correspondence(unsigned char *frm, int *frm_corners, + int num_frm_corners, unsigned char *ref, + int *ref_corners, int num_ref_corners, int width, + int height, int frm_stride, int ref_stride, + int *correspondence_pts); + +#endif // AV1_ENCODER_CORNER_MATCH_H_ diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c new file mode 100644 index 000000000..e3151a597 --- /dev/null +++ b/third_party/aom/av1/encoder/cost.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "av1/encoder/cost.h" +#include "av1/common/entropy.h" + +/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)) + Begins with a bogus entry for simpler addressing. */ +const uint16_t av1_prob_cost[256] = { + 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260, + 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718, + 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409, + 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192, + 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024, + 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887, + 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772, + 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673, + 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585, + 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506, + 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, + 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, + 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, + 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256, + 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205, + 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157, + 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112, + 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, + 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, + 26, 23, 20, 18, 15, 12, 9, 6, 3 +}; + +static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i, + int c) { + const aom_prob prob = probs[i / 2]; + int b; + + assert(prob != 0); + for (b = 0; b <= 1; ++b) { + const int cc = c + av1_cost_bit(prob, b); + const aom_tree_index ii = tree[i + b]; + + if (ii <= 0) + costs[-ii] = cc; + else + cost(costs, tree, probs, ii, cc); + } +} + +void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) { + cost(costs, tree, probs, 0, 0); +} + +void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) { + assert(tree[0] <= 0 && tree[1] > 0); + + costs[-tree[0]] = av1_cost_bit(probs[0], 0); + cost(costs, tree, probs, 2, 0); +} diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h new file mode 100644 index 000000000..d8fb357e6 --- /dev/null +++ b/third_party/aom/av1/encoder/cost.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_COST_H_ +#define AV1_ENCODER_COST_H_ + +#include "aom_dsp/prob.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint16_t av1_prob_cost[256]; + +// The factor to scale from cost in bits to cost in av1_prob_cost units. +#define AV1_PROB_COST_SHIFT 9 + +#define av1_cost_zero(prob) (av1_prob_cost[prob]) + +#define av1_cost_one(prob) av1_cost_zero(256 - (prob)) + +#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob)) + +// Cost of coding an n bit literal, using 128 (i.e. 50%) probability +// for each bit. +#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) + +static INLINE unsigned int cost_branch256(const unsigned int ct[2], + aom_prob p) { + return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p); +} + +static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits, + int len) { + int cost = 0; + aom_tree_index i = 0; + + do { + const int bit = (bits >> --len) & 1; + cost += av1_cost_bit(probs[i >> 1], bit); + i = tree[i + bit]; + } while (len); + + return cost; +} + +void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree); +void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_COST_H_ diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c new file mode 100644 index 000000000..3df424cac --- /dev/null +++ b/third_party/aom/av1/encoder/daala_compat_enc.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "encint.h" + +void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) { +#if CONFIG_DAALA_EC + od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + OD_COPY(&rbuf->adapt, enc->state.adapt, 1); +} + +void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) { +#if CONFIG_DAALA_EC + od_ec_enc_rollback(&enc->w.ec, &rbuf->ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + OD_COPY(enc->state.adapt, &rbuf->adapt, 1); +} diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c new file mode 100644 index 000000000..09e1b0563 --- /dev/null +++ b/third_party/aom/av1/encoder/dct.c @@ -0,0 +1,2228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" +#include "aom_dsp/fwd_txfm.h" +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/av1_fwd_txfm1d.h" +#include "av1/common/av1_fwd_txfm2d_cfg.h" +#include "av1/common/idct.h" + +static INLINE void range_check(const tran_low_t *input, const int size, + const int bit) { +#if 0 // CONFIG_COEFFICIENT_RANGE_CHECKING +// TODO(angiebird): the range_check is not used because the bit range +// in fdct# is not correct. Since we are going to merge in a new version +// of fdct# from nextgenv2, we won't fix the incorrect bit range now. + int i; + for (i = 0; i < size; ++i) { + assert(abs(input[i]) < (1 << bit)); + } +#else + (void)input; + (void)size; + (void)bit; +#endif +} + +static void fdct4(const tran_low_t *input, tran_low_t *output) { + tran_high_t temp; + tran_low_t step[4]; + + // stage 0 + range_check(input, 4, 14); + + // stage 1 + output[0] = input[0] + input[3]; + output[1] = input[1] + input[2]; + output[2] = input[1] - input[2]; + output[3] = input[0] - input[3]; + + range_check(output, 4, 15); + + // stage 2 + temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; + step[0] = (tran_low_t)fdct_round_shift(temp); + temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; + step[1] = (tran_low_t)fdct_round_shift(temp); + temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; + step[2] = (tran_low_t)fdct_round_shift(temp); + temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; + step[3] = (tran_low_t)fdct_round_shift(temp); + + range_check(step, 4, 16); + + // stage 3 + output[0] = step[0]; + output[1] = step[2]; + output[2] = step[1]; + output[3] = step[3]; + + range_check(output, 4, 16); +} + +static void fdct8(const tran_low_t *input, tran_low_t *output) { + tran_high_t temp; + tran_low_t step[8]; + + // stage 0 + range_check(input, 8, 13); + + // stage 1 + output[0] = input[0] + input[7]; + output[1] = input[1] + input[6]; + output[2] = input[2] + input[5]; + output[3] = input[3] + input[4]; + output[4] = input[3] - input[4]; + output[5] = input[2] - input[5]; + output[6] = input[1] - input[6]; + output[7] = input[0] - input[7]; + + range_check(output, 8, 14); + + // stage 2 + step[0] = output[0] + output[3]; + step[1] = output[1] + output[2]; + step[2] = output[1] - output[2]; + step[3] = output[0] - output[3]; + step[4] = output[4]; + temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64; + step[5] = (tran_low_t)fdct_round_shift(temp); + temp = output[6] * cospi_16_64 + output[5] * cospi_16_64; + step[6] = (tran_low_t)fdct_round_shift(temp); + step[7] = output[7]; + + range_check(step, 8, 15); + + // stage 3 + temp = step[0] * cospi_16_64 + step[1] * cospi_16_64; + output[0] = (tran_low_t)fdct_round_shift(temp); + temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64; + output[1] = (tran_low_t)fdct_round_shift(temp); + temp = step[2] * cospi_24_64 + step[3] * cospi_8_64; + output[2] = (tran_low_t)fdct_round_shift(temp); + temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64; + output[3] = (tran_low_t)fdct_round_shift(temp); + output[4] = step[4] + step[5]; + output[5] = step[4] - step[5]; + output[6] = step[7] - step[6]; + output[7] = step[7] + step[6]; + + range_check(output, 8, 16); + + // stage 4 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + temp = output[4] * cospi_28_64 + output[7] * cospi_4_64; + step[4] = (tran_low_t)fdct_round_shift(temp); + temp = output[5] * cospi_12_64 + output[6] * cospi_20_64; + step[5] = (tran_low_t)fdct_round_shift(temp); + temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64; + step[6] = (tran_low_t)fdct_round_shift(temp); + temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64; + step[7] = (tran_low_t)fdct_round_shift(temp); + + range_check(step, 8, 16); + + // stage 5 + output[0] = step[0]; + output[1] = step[4]; + output[2] = step[2]; + output[3] = step[6]; + output[4] = step[1]; + output[5] = step[5]; + output[6] = step[3]; + output[7] = step[7]; + + range_check(output, 8, 16); +} + +static void fdct16(const tran_low_t *input, tran_low_t *output) { + tran_high_t temp; + tran_low_t step[16]; + + // stage 0 + range_check(input, 16, 13); + + // stage 1 + output[0] = input[0] + input[15]; + output[1] = input[1] + input[14]; + output[2] = input[2] + input[13]; + output[3] = input[3] + input[12]; + output[4] = input[4] + input[11]; + output[5] = input[5] + input[10]; + output[6] = input[6] + input[9]; + output[7] = input[7] + input[8]; + output[8] = input[7] - input[8]; + output[9] = input[6] - input[9]; + output[10] = input[5] - input[10]; + output[11] = input[4] - input[11]; + output[12] = input[3] - input[12]; + output[13] = input[2] - input[13]; + output[14] = input[1] - input[14]; + output[15] = input[0] - input[15]; + + range_check(output, 16, 14); + + // stage 2 + step[0] = output[0] + output[7]; + step[1] = output[1] + output[6]; + step[2] = output[2] + output[5]; + step[3] = output[3] + output[4]; + step[4] = output[3] - output[4]; + step[5] = output[2] - output[5]; + step[6] = output[1] - output[6]; + step[7] = output[0] - output[7]; + step[8] = output[8]; + step[9] = output[9]; + temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64; + step[10] = (tran_low_t)fdct_round_shift(temp); + temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64; + step[11] = (tran_low_t)fdct_round_shift(temp); + temp = output[12] * cospi_16_64 + output[11] * cospi_16_64; + step[12] = (tran_low_t)fdct_round_shift(temp); + temp = output[13] * cospi_16_64 + output[10] * cospi_16_64; + step[13] = (tran_low_t)fdct_round_shift(temp); + step[14] = output[14]; + step[15] = output[15]; + + range_check(step, 16, 15); + + // stage 3 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = step[1] - step[2]; + output[3] = step[0] - step[3]; + output[4] = step[4]; + temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64; + output[5] = (tran_low_t)fdct_round_shift(temp); + temp = step[6] * cospi_16_64 + step[5] * cospi_16_64; + output[6] = (tran_low_t)fdct_round_shift(temp); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = step[9] - step[10]; + output[11] = step[8] - step[11]; + output[12] = step[15] - step[12]; + output[13] = step[14] - step[13]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + range_check(output, 16, 16); + + // stage 4 + temp = output[0] * cospi_16_64 + output[1] * cospi_16_64; + step[0] = (tran_low_t)fdct_round_shift(temp); + temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64; + step[1] = (tran_low_t)fdct_round_shift(temp); + temp = output[2] * cospi_24_64 + output[3] * cospi_8_64; + step[2] = (tran_low_t)fdct_round_shift(temp); + temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64; + step[3] = (tran_low_t)fdct_round_shift(temp); + step[4] = output[4] + output[5]; + step[5] = output[4] - output[5]; + step[6] = output[7] - output[6]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64; + step[9] = (tran_low_t)fdct_round_shift(temp); + temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64; + step[10] = (tran_low_t)fdct_round_shift(temp); + step[11] = output[11]; + step[12] = output[12]; + temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64; + step[13] = (tran_low_t)fdct_round_shift(temp); + temp = output[14] * cospi_8_64 + output[9] * cospi_24_64; + step[14] = (tran_low_t)fdct_round_shift(temp); + step[15] = output[15]; + + range_check(step, 16, 16); + + // stage 5 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + temp = step[4] * cospi_28_64 + step[7] * cospi_4_64; + output[4] = (tran_low_t)fdct_round_shift(temp); + temp = step[5] * cospi_12_64 + step[6] * cospi_20_64; + output[5] = (tran_low_t)fdct_round_shift(temp); + temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64; + output[6] = (tran_low_t)fdct_round_shift(temp); + temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64; + output[7] = (tran_low_t)fdct_round_shift(temp); + output[8] = step[8] + step[9]; + output[9] = step[8] - step[9]; + output[10] = step[11] - step[10]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = step[12] - step[13]; + output[14] = step[15] - step[14]; + output[15] = step[15] + step[14]; + + range_check(output, 16, 16); + + // stage 6 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + temp = output[8] * cospi_30_64 + output[15] * cospi_2_64; + step[8] = (tran_low_t)fdct_round_shift(temp); + temp = output[9] * cospi_14_64 + output[14] * cospi_18_64; + step[9] = (tran_low_t)fdct_round_shift(temp); + temp = output[10] * cospi_22_64 + output[13] * cospi_10_64; + step[10] = (tran_low_t)fdct_round_shift(temp); + temp = output[11] * cospi_6_64 + output[12] * cospi_26_64; + step[11] = (tran_low_t)fdct_round_shift(temp); + temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64; + step[12] = (tran_low_t)fdct_round_shift(temp); + temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64; + step[13] = (tran_low_t)fdct_round_shift(temp); + temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64; + step[14] = (tran_low_t)fdct_round_shift(temp); + temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64; + step[15] = (tran_low_t)fdct_round_shift(temp); + + range_check(step, 16, 16); + + // stage 7 + output[0] = step[0]; + output[1] = step[8]; + output[2] = step[4]; + output[3] = step[12]; + output[4] = step[2]; + output[5] = step[10]; + output[6] = step[6]; + output[7] = step[14]; + output[8] = step[1]; + output[9] = step[9]; + output[10] = step[5]; + output[11] = step[13]; + output[12] = step[3]; + output[13] = step[11]; + output[14] = step[7]; + output[15] = step[15]; + + range_check(output, 16, 16); +} + +static void fdct32(const tran_low_t *input, tran_low_t *output) { + tran_high_t temp; + tran_low_t step[32]; + + // stage 0 + range_check(input, 32, 14); + + // stage 1 + output[0] = input[0] + input[31]; + output[1] = input[1] + input[30]; + output[2] = input[2] + input[29]; + output[3] = input[3] + input[28]; + output[4] = input[4] + input[27]; + output[5] = input[5] + input[26]; + output[6] = input[6] + input[25]; + output[7] = input[7] + input[24]; + output[8] = input[8] + input[23]; + output[9] = input[9] + input[22]; + output[10] = input[10] + input[21]; + output[11] = input[11] + input[20]; + output[12] = input[12] + input[19]; + output[13] = input[13] + input[18]; + output[14] = input[14] + input[17]; + output[15] = input[15] + input[16]; + output[16] = input[15] - input[16]; + output[17] = input[14] - input[17]; + output[18] = input[13] - input[18]; + output[19] = input[12] - input[19]; + output[20] = input[11] - input[20]; + output[21] = input[10] - input[21]; + output[22] = input[9] - input[22]; + output[23] = input[8] - input[23]; + output[24] = input[7] - input[24]; + output[25] = input[6] - input[25]; + output[26] = input[5] - input[26]; + output[27] = input[4] - input[27]; + output[28] = input[3] - input[28]; + output[29] = input[2] - input[29]; + output[30] = input[1] - input[30]; + output[31] = input[0] - input[31]; + + range_check(output, 32, 15); + + // stage 2 + step[0] = output[0] + output[15]; + step[1] = output[1] + output[14]; + step[2] = output[2] + output[13]; + step[3] = output[3] + output[12]; + step[4] = output[4] + output[11]; + step[5] = output[5] + output[10]; + step[6] = output[6] + output[9]; + step[7] = output[7] + output[8]; + step[8] = output[7] - output[8]; + step[9] = output[6] - output[9]; + step[10] = output[5] - output[10]; + step[11] = output[4] - output[11]; + step[12] = output[3] - output[12]; + step[13] = output[2] - output[13]; + step[14] = output[1] - output[14]; + step[15] = output[0] - output[15]; + step[16] = output[16]; + step[17] = output[17]; + step[18] = output[18]; + step[19] = output[19]; + temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64; + step[20] = (tran_low_t)fdct_round_shift(temp); + temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64; + step[21] = (tran_low_t)fdct_round_shift(temp); + temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64; + step[22] = (tran_low_t)fdct_round_shift(temp); + temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64; + step[23] = (tran_low_t)fdct_round_shift(temp); + temp = output[24] * cospi_16_64 + output[23] * cospi_16_64; + step[24] = (tran_low_t)fdct_round_shift(temp); + temp = output[25] * cospi_16_64 + output[22] * cospi_16_64; + step[25] = (tran_low_t)fdct_round_shift(temp); + temp = output[26] * cospi_16_64 + output[21] * cospi_16_64; + step[26] = (tran_low_t)fdct_round_shift(temp); + temp = output[27] * cospi_16_64 + output[20] * cospi_16_64; + step[27] = (tran_low_t)fdct_round_shift(temp); + step[28] = output[28]; + step[29] = output[29]; + step[30] = output[30]; + step[31] = output[31]; + + range_check(step, 32, 16); + + // stage 3 + output[0] = step[0] + step[7]; + output[1] = step[1] + step[6]; + output[2] = step[2] + step[5]; + output[3] = step[3] + step[4]; + output[4] = step[3] - step[4]; + output[5] = step[2] - step[5]; + output[6] = step[1] - step[6]; + output[7] = step[0] - step[7]; + output[8] = step[8]; + output[9] = step[9]; + temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64; + output[10] = (tran_low_t)fdct_round_shift(temp); + temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64; + output[11] = (tran_low_t)fdct_round_shift(temp); + temp = step[12] * cospi_16_64 + step[11] * cospi_16_64; + output[12] = (tran_low_t)fdct_round_shift(temp); + temp = step[13] * cospi_16_64 + step[10] * cospi_16_64; + output[13] = (tran_low_t)fdct_round_shift(temp); + output[14] = step[14]; + output[15] = step[15]; + output[16] = step[16] + step[23]; + output[17] = step[17] + step[22]; + output[18] = step[18] + step[21]; + output[19] = step[19] + step[20]; + output[20] = step[19] - step[20]; + output[21] = step[18] - step[21]; + output[22] = step[17] - step[22]; + output[23] = step[16] - step[23]; + output[24] = step[31] - step[24]; + output[25] = step[30] - step[25]; + output[26] = step[29] - step[26]; + output[27] = step[28] - step[27]; + output[28] = step[28] + step[27]; + output[29] = step[29] + step[26]; + output[30] = step[30] + step[25]; + output[31] = step[31] + step[24]; + + range_check(output, 32, 17); + + // stage 4 + step[0] = output[0] + output[3]; + step[1] = output[1] + output[2]; + step[2] = output[1] - output[2]; + step[3] = output[0] - output[3]; + step[4] = output[4]; + temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64; + step[5] = (tran_low_t)fdct_round_shift(temp); + temp = output[6] * cospi_16_64 + output[5] * cospi_16_64; + step[6] = (tran_low_t)fdct_round_shift(temp); + step[7] = output[7]; + step[8] = output[8] + output[11]; + step[9] = output[9] + output[10]; + step[10] = output[9] - output[10]; + step[11] = output[8] - output[11]; + step[12] = output[15] - output[12]; + step[13] = output[14] - output[13]; + step[14] = output[14] + output[13]; + step[15] = output[15] + output[12]; + step[16] = output[16]; + step[17] = output[17]; + temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64; + step[18] = (tran_low_t)fdct_round_shift(temp); + temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64; + step[19] = (tran_low_t)fdct_round_shift(temp); + temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64; + step[20] = (tran_low_t)fdct_round_shift(temp); + temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64; + step[21] = (tran_low_t)fdct_round_shift(temp); + step[22] = output[22]; + step[23] = output[23]; + step[24] = output[24]; + step[25] = output[25]; + temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64; + step[26] = (tran_low_t)fdct_round_shift(temp); + temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64; + step[27] = (tran_low_t)fdct_round_shift(temp); + temp = output[28] * cospi_8_64 + output[19] * cospi_24_64; + step[28] = (tran_low_t)fdct_round_shift(temp); + temp = output[29] * cospi_8_64 + output[18] * cospi_24_64; + step[29] = (tran_low_t)fdct_round_shift(temp); + step[30] = output[30]; + step[31] = output[31]; + + range_check(step, 32, 18); + + // stage 5 + temp = step[0] * cospi_16_64 + step[1] * cospi_16_64; + output[0] = (tran_low_t)fdct_round_shift(temp); + temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64; + output[1] = (tran_low_t)fdct_round_shift(temp); + temp = step[2] * cospi_24_64 + step[3] * cospi_8_64; + output[2] = (tran_low_t)fdct_round_shift(temp); + temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64; + output[3] = (tran_low_t)fdct_round_shift(temp); + output[4] = step[4] + step[5]; + output[5] = step[4] - step[5]; + output[6] = step[7] - step[6]; + output[7] = step[7] + step[6]; + output[8] = step[8]; + temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64; + output[9] = (tran_low_t)fdct_round_shift(temp); + temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64; + output[10] = (tran_low_t)fdct_round_shift(temp); + output[11] = step[11]; + output[12] = step[12]; + temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64; + output[13] = (tran_low_t)fdct_round_shift(temp); + temp = step[14] * cospi_8_64 + step[9] * cospi_24_64; + output[14] = (tran_low_t)fdct_round_shift(temp); + output[15] = step[15]; + output[16] = step[16] + step[19]; + output[17] = step[17] + step[18]; + output[18] = step[17] - step[18]; + output[19] = step[16] - step[19]; + output[20] = step[23] - step[20]; + output[21] = step[22] - step[21]; + output[22] = step[22] + step[21]; + output[23] = step[23] + step[20]; + output[24] = step[24] + step[27]; + output[25] = step[25] + step[26]; + output[26] = step[25] - step[26]; + output[27] = step[24] - step[27]; + output[28] = step[31] - step[28]; + output[29] = step[30] - step[29]; + output[30] = step[30] + step[29]; + output[31] = step[31] + step[28]; + + range_check(output, 32, 18); + + // stage 6 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + temp = output[4] * cospi_28_64 + output[7] * cospi_4_64; + step[4] = (tran_low_t)fdct_round_shift(temp); + temp = output[5] * cospi_12_64 + output[6] * cospi_20_64; + step[5] = (tran_low_t)fdct_round_shift(temp); + temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64; + step[6] = (tran_low_t)fdct_round_shift(temp); + temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64; + step[7] = (tran_low_t)fdct_round_shift(temp); + step[8] = output[8] + output[9]; + step[9] = output[8] - output[9]; + step[10] = output[11] - output[10]; + step[11] = output[11] + output[10]; + step[12] = output[12] + output[13]; + step[13] = output[12] - output[13]; + step[14] = output[15] - output[14]; + step[15] = output[15] + output[14]; + step[16] = output[16]; + temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64; + step[17] = (tran_low_t)fdct_round_shift(temp); + temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64; + step[18] = (tran_low_t)fdct_round_shift(temp); + step[19] = output[19]; + step[20] = output[20]; + temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64; + step[21] = (tran_low_t)fdct_round_shift(temp); + temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64; + step[22] = (tran_low_t)fdct_round_shift(temp); + step[23] = output[23]; + step[24] = output[24]; + temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64; + step[25] = (tran_low_t)fdct_round_shift(temp); + temp = output[26] * cospi_20_64 + output[21] * cospi_12_64; + step[26] = (tran_low_t)fdct_round_shift(temp); + step[27] = output[27]; + step[28] = output[28]; + temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64; + step[29] = (tran_low_t)fdct_round_shift(temp); + temp = output[30] * cospi_4_64 + output[17] * cospi_28_64; + step[30] = (tran_low_t)fdct_round_shift(temp); + step[31] = output[31]; + + range_check(step, 32, 18); + + // stage 7 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = step[4]; + output[5] = step[5]; + output[6] = step[6]; + output[7] = step[7]; + temp = step[8] * cospi_30_64 + step[15] * cospi_2_64; + output[8] = (tran_low_t)fdct_round_shift(temp); + temp = step[9] * cospi_14_64 + step[14] * cospi_18_64; + output[9] = (tran_low_t)fdct_round_shift(temp); + temp = step[10] * cospi_22_64 + step[13] * cospi_10_64; + output[10] = (tran_low_t)fdct_round_shift(temp); + temp = step[11] * cospi_6_64 + step[12] * cospi_26_64; + output[11] = (tran_low_t)fdct_round_shift(temp); + temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64; + output[12] = (tran_low_t)fdct_round_shift(temp); + temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64; + output[13] = (tran_low_t)fdct_round_shift(temp); + temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64; + output[14] = (tran_low_t)fdct_round_shift(temp); + temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64; + output[15] = (tran_low_t)fdct_round_shift(temp); + output[16] = step[16] + step[17]; + output[17] = step[16] - step[17]; + output[18] = step[19] - step[18]; + output[19] = step[19] + step[18]; + output[20] = step[20] + step[21]; + output[21] = step[20] - step[21]; + output[22] = step[23] - step[22]; + output[23] = step[23] + step[22]; + output[24] = step[24] + step[25]; + output[25] = step[24] - step[25]; + output[26] = step[27] - step[26]; + output[27] = step[27] + step[26]; + output[28] = step[28] + step[29]; + output[29] = step[28] - step[29]; + output[30] = step[31] - step[30]; + output[31] = step[31] + step[30]; + + range_check(output, 32, 18); + + // stage 8 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = output[10]; + step[11] = output[11]; + step[12] = output[12]; + step[13] = output[13]; + step[14] = output[14]; + step[15] = output[15]; + temp = output[16] * cospi_31_64 + output[31] * cospi_1_64; + step[16] = (tran_low_t)fdct_round_shift(temp); + temp = output[17] * cospi_15_64 + output[30] * cospi_17_64; + step[17] = (tran_low_t)fdct_round_shift(temp); + temp = output[18] * cospi_23_64 + output[29] * cospi_9_64; + step[18] = (tran_low_t)fdct_round_shift(temp); + temp = output[19] * cospi_7_64 + output[28] * cospi_25_64; + step[19] = (tran_low_t)fdct_round_shift(temp); + temp = output[20] * cospi_27_64 + output[27] * cospi_5_64; + step[20] = (tran_low_t)fdct_round_shift(temp); + temp = output[21] * cospi_11_64 + output[26] * cospi_21_64; + step[21] = (tran_low_t)fdct_round_shift(temp); + temp = output[22] * cospi_19_64 + output[25] * cospi_13_64; + step[22] = (tran_low_t)fdct_round_shift(temp); + temp = output[23] * cospi_3_64 + output[24] * cospi_29_64; + step[23] = (tran_low_t)fdct_round_shift(temp); + temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64; + step[24] = (tran_low_t)fdct_round_shift(temp); + temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64; + step[25] = (tran_low_t)fdct_round_shift(temp); + temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64; + step[26] = (tran_low_t)fdct_round_shift(temp); + temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64; + step[27] = (tran_low_t)fdct_round_shift(temp); + temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64; + step[28] = (tran_low_t)fdct_round_shift(temp); + temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64; + step[29] = (tran_low_t)fdct_round_shift(temp); + temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64; + step[30] = (tran_low_t)fdct_round_shift(temp); + temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64; + step[31] = (tran_low_t)fdct_round_shift(temp); + + range_check(step, 32, 18); + + // stage 9 + output[0] = step[0]; + output[1] = step[16]; + output[2] = step[8]; + output[3] = step[24]; + output[4] = step[4]; + output[5] = step[20]; + output[6] = step[12]; + output[7] = step[28]; + output[8] = step[2]; + output[9] = step[18]; + output[10] = step[10]; + output[11] = step[26]; + output[12] = step[6]; + output[13] = step[22]; + output[14] = step[14]; + output[15] = step[30]; + output[16] = step[1]; + output[17] = step[17]; + output[18] = step[9]; + output[19] = step[25]; + output[20] = step[5]; + output[21] = step[21]; + output[22] = step[13]; + output[23] = step[29]; + output[24] = step[3]; + output[25] = step[19]; + output[26] = step[11]; + output[27] = step[27]; + output[28] = step[7]; + output[29] = step[23]; + output[30] = step[15]; + output[31] = step[31]; + + range_check(output, 32, 18); +} + +#ifndef AV1_DCT_GTEST + +static void fadst4(const tran_low_t *input, tran_low_t *output) { + tran_high_t x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_4_9 * x0; + s2 = sinpi_2_9 * x1; + s3 = sinpi_1_9 * x1; + s4 = sinpi_3_9 * x2; + s5 = sinpi_4_9 * x3; + s6 = sinpi_2_9 * x3; + s7 = x0 + x1 - x3; + + x0 = s0 + s2 + s5; + x1 = sinpi_3_9 * s7; + x2 = s1 - s3 + s6; + x3 = s4; + + s0 = x0 + x3; + s1 = x1; + s2 = x2 - x3; + s3 = x2 - x0 + x3; + + // 1-D transform scaling factor is sqrt(2). + output[0] = (tran_low_t)fdct_round_shift(s0); + output[1] = (tran_low_t)fdct_round_shift(s1); + output[2] = (tran_low_t)fdct_round_shift(s2); + output[3] = (tran_low_t)fdct_round_shift(s3); +} + +static void fadst8(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = fdct_round_shift(s0 - s4); + x5 = fdct_round_shift(s1 - s5); + x6 = fdct_round_shift(s2 - s6); + x7 = fdct_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = fdct_round_shift(s0 + s2); + x1 = fdct_round_shift(s1 + s3); + x2 = fdct_round_shift(s0 - s2); + x3 = fdct_round_shift(s1 - s3); + x4 = fdct_round_shift(s4 + s6); + x5 = fdct_round_shift(s5 + s7); + x6 = fdct_round_shift(s4 - s6); + x7 = fdct_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = fdct_round_shift(s2); + x3 = fdct_round_shift(s3); + x6 = fdct_round_shift(s6); + x7 = fdct_round_shift(s7); + + output[0] = (tran_low_t)x0; + output[1] = (tran_low_t)-x4; + output[2] = (tran_low_t)x6; + output[3] = (tran_low_t)-x2; + output[4] = (tran_low_t)x3; + output[5] = (tran_low_t)-x7; + output[6] = (tran_low_t)x5; + output[7] = (tran_low_t)-x1; +} + +static void fadst16(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = s0 + s8; + x1 = s1 + s9; + x2 = s2 + s10; + x3 = s3 + s11; + x4 = s4 + s12; + x5 = s5 + s13; + x6 = s6 + s14; + x7 = s7 + s15; + + x8 = fdct_round_shift(s0 - s8); + x9 = fdct_round_shift(s1 - s9); + x10 = fdct_round_shift(s2 - s10); + x11 = fdct_round_shift(s3 - s11); + x12 = fdct_round_shift(s4 - s12); + x13 = fdct_round_shift(s5 - s13); + x14 = fdct_round_shift(s6 - s14); + x15 = fdct_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = fdct_round_shift(s0 - s4); + x5 = fdct_round_shift(s1 - s5); + x6 = fdct_round_shift(s2 - s6); + x7 = fdct_round_shift(s3 - s7); + + x8 = s8 + s12; + x9 = s9 + s13; + x10 = s10 + s14; + x11 = s11 + s15; + x12 = fdct_round_shift(s8 - s12); + x13 = fdct_round_shift(s9 - s13); + x14 = fdct_round_shift(s10 - s14); + x15 = fdct_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = fdct_round_shift(s0 + s2); + x1 = fdct_round_shift(s1 + s3); + x2 = fdct_round_shift(s0 - s2); + x3 = fdct_round_shift(s1 - s3); + + x4 = fdct_round_shift(s4 + s6); + x5 = fdct_round_shift(s5 + s7); + x6 = fdct_round_shift(s4 - s6); + x7 = fdct_round_shift(s5 - s7); + + x8 = fdct_round_shift(s8 + s10); + x9 = fdct_round_shift(s9 + s11); + x10 = fdct_round_shift(s8 - s10); + x11 = fdct_round_shift(s9 - s11); + + x12 = fdct_round_shift(s12 + s14); + x13 = fdct_round_shift(s13 + s15); + x14 = fdct_round_shift(s12 - s14); + x15 = fdct_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = fdct_round_shift(s2); + x3 = fdct_round_shift(s3); + x6 = fdct_round_shift(s6); + x7 = fdct_round_shift(s7); + x10 = fdct_round_shift(s10); + x11 = fdct_round_shift(s11); + x14 = fdct_round_shift(s14); + x15 = fdct_round_shift(s15); + + output[0] = (tran_low_t)x0; + output[1] = (tran_low_t)-x8; + output[2] = (tran_low_t)x12; + output[3] = (tran_low_t)-x4; + output[4] = (tran_low_t)x6; + output[5] = (tran_low_t)x14; + output[6] = (tran_low_t)x10; + output[7] = (tran_low_t)x2; + output[8] = (tran_low_t)x3; + output[9] = (tran_low_t)x11; + output[10] = (tran_low_t)x15; + output[11] = (tran_low_t)x7; + output[12] = (tran_low_t)x5; + output[13] = (tran_low_t)-x13; + output[14] = (tran_low_t)x9; + output[15] = (tran_low_t)-x1; +} + +// For use in lieu of ADST +static void fhalfright32(const tran_low_t *input, tran_low_t *output) { + int i; + tran_low_t inputhalf[16]; + for (i = 0; i < 16; ++i) { + output[16 + i] = input[i] * 4; + } + // Multiply input by sqrt(2) + for (i = 0; i < 16; ++i) { + inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2); + } + fdct16(inputhalf, output); + // Note overall scaling factor is 4 times orthogonal +} + +#if CONFIG_EXT_TX +static void fidtx4(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 4; ++i) + output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2); +} + +static void fidtx8(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 8; ++i) output[i] = input[i] * 2; +} + +static void fidtx16(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 16; ++i) + output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2); +} + +static void fidtx32(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 32; ++i) output[i] = input[i] * 4; +} + +static void copy_block(const int16_t *src, int src_stride, int l, int w, + int16_t *dest, int dest_stride) { + int i; + for (i = 0; i < l; ++i) { + memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t)); + } +} + +static void fliplr(int16_t *dest, int stride, int l, int w) { + int i, j; + for (i = 0; i < l; ++i) { + for (j = 0; j < w / 2; ++j) { + const int16_t tmp = dest[i * stride + j]; + dest[i * stride + j] = dest[i * stride + w - 1 - j]; + dest[i * stride + w - 1 - j] = tmp; + } + } +} + +static void flipud(int16_t *dest, int stride, int l, int w) { + int i, j; + for (j = 0; j < w; ++j) { + for (i = 0; i < l / 2; ++i) { + const int16_t tmp = dest[i * stride + j]; + dest[i * stride + j] = dest[(l - 1 - i) * stride + j]; + dest[(l - 1 - i) * stride + j] = tmp; + } + } +} + +static void fliplrud(int16_t *dest, int stride, int l, int w) { + int i, j; + for (i = 0; i < l / 2; ++i) { + for (j = 0; j < w; ++j) { + const int16_t tmp = dest[i * stride + j]; + dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j]; + dest[(l - 1 - i) * stride + w - 1 - j] = tmp; + } + } +} + +static void copy_fliplr(const int16_t *src, int src_stride, int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + fliplr(dest, dest_stride, l, w); +} + +static void copy_flipud(const int16_t *src, int src_stride, int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + flipud(dest, dest_stride, l, w); +} + +static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w, + int16_t *dest, int dest_stride) { + copy_block(src, src_stride, l, w, dest, dest_stride); + fliplrud(dest, dest_stride, l, w); +} + +static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w, + int16_t *buff, int tx_type) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case IDTX: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + copy_flipud(*src, *src_stride, l, w, buff, w); + *src = buff; + *src_stride = w; + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + copy_fliplr(*src, *src_stride, l, w, buff, w); + *src = buff; + *src_stride = w; + break; + case FLIPADST_FLIPADST: + copy_fliplrud(*src, *src_stride, l, w, buff, w); + *src = buff; + *src_stride = w; + break; + default: assert(0); break; + } +} +#endif // CONFIG_EXT_TX + +void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + if (tx_type == DCT_DCT) { + aom_fdct4x4_c(input, output, stride); + } else { + static const transform_2d FHT[] = { + { fdct4, fdct4 }, // DCT_DCT + { fadst4, fdct4 }, // ADST_DCT + { fdct4, fadst4 }, // DCT_ADST + { fadst4, fadst4 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst4, fdct4 }, // FLIPADST_DCT + { fdct4, fadst4 }, // DCT_FLIPADST + { fadst4, fadst4 }, // FLIPADST_FLIPADST + { fadst4, fadst4 }, // ADST_FLIPADST + { fadst4, fadst4 }, // FLIPADST_ADST + { fidtx4, fidtx4 }, // IDTX + { fdct4, fidtx4 }, // V_DCT + { fidtx4, fdct4 }, // H_DCT + { fadst4, fidtx4 }, // V_ADST + { fidtx4, fadst4 }, // H_ADST + { fadst4, fidtx4 }, // V_FLIPADST + { fidtx4, fadst4 }, // H_FLIPADST +#endif // CONFIG_EXT_TX + }; + const transform_2d ht = FHT[tx_type]; + tran_low_t out[4 * 4]; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + +#if CONFIG_EXT_TX + int16_t flipped_input[4 * 4]; + maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16; + if (i == 0 && temp_in[0]) temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2; + } + } +} + +void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct8, fdct4 }, // DCT_DCT + { fadst8, fdct4 }, // ADST_DCT + { fdct8, fadst4 }, // DCT_ADST + { fadst8, fadst4 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst8, fdct4 }, // FLIPADST_DCT + { fdct8, fadst4 }, // DCT_FLIPADST + { fadst8, fadst4 }, // FLIPADST_FLIPADST + { fadst8, fadst4 }, // ADST_FLIPADST + { fadst8, fadst4 }, // FLIPADST_ADST + { fidtx8, fidtx4 }, // IDTX + { fdct8, fidtx4 }, // V_DCT + { fidtx8, fdct4 }, // H_DCT + { fadst8, fidtx4 }, // V_ADST + { fidtx8, fadst4 }, // H_ADST + { fadst8, fidtx4 }, // V_FLIPADST + { fidtx8, fadst4 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 4; + const int n2 = 8; + tran_low_t out[8 * 4]; + tran_low_t temp_in[8], temp_out[8]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[8 * 4]; + maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); +#endif + + // Rows + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; + } + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.cols(temp_in, temp_out); + for (j = 0; j < n2; ++j) + output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct4, fdct8 }, // DCT_DCT + { fadst4, fdct8 }, // ADST_DCT + { fdct4, fadst8 }, // DCT_ADST + { fadst4, fadst8 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst4, fdct8 }, // FLIPADST_DCT + { fdct4, fadst8 }, // DCT_FLIPADST + { fadst4, fadst8 }, // FLIPADST_FLIPADST + { fadst4, fadst8 }, // ADST_FLIPADST + { fadst4, fadst8 }, // FLIPADST_ADST + { fidtx4, fidtx8 }, // IDTX + { fdct4, fidtx8 }, // V_DCT + { fidtx4, fdct8 }, // H_DCT + { fadst4, fidtx8 }, // V_ADST + { fidtx4, fadst8 }, // H_ADST + { fadst4, fidtx8 }, // V_FLIPADST + { fidtx4, fadst8 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 4; + const int n2 = 8; + tran_low_t out[8 * 4]; + tran_low_t temp_in[8], temp_out[8]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[8 * 4]; + maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n2; ++j) + output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct16, fdct4 }, // DCT_DCT + { fadst16, fdct4 }, // ADST_DCT + { fdct16, fadst4 }, // DCT_ADST + { fadst16, fadst4 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst16, fdct4 }, // FLIPADST_DCT + { fdct16, fadst4 }, // DCT_FLIPADST + { fadst16, fadst4 }, // FLIPADST_FLIPADST + { fadst16, fadst4 }, // ADST_FLIPADST + { fadst16, fadst4 }, // FLIPADST_ADST + { fidtx16, fidtx4 }, // IDTX + { fdct16, fidtx4 }, // V_DCT + { fidtx16, fdct4 }, // H_DCT + { fadst16, fidtx4 }, // V_ADST + { fidtx16, fadst4 }, // H_ADST + { fadst16, fidtx4 }, // V_FLIPADST + { fidtx16, fadst4 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 4; + const int n4 = 16; + tran_low_t out[16 * 4]; + tran_low_t temp_in[16], temp_out[16]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 4]; + maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); +#endif + + // Rows + for (i = 0; i < n4; ++i) { + for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; + } + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; + ht.cols(temp_in, temp_out); + for (j = 0; j < n4; ++j) + output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct4, fdct16 }, // DCT_DCT + { fadst4, fdct16 }, // ADST_DCT + { fdct4, fadst16 }, // DCT_ADST + { fadst4, fadst16 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst4, fdct16 }, // FLIPADST_DCT + { fdct4, fadst16 }, // DCT_FLIPADST + { fadst4, fadst16 }, // FLIPADST_FLIPADST + { fadst4, fadst16 }, // ADST_FLIPADST + { fadst4, fadst16 }, // FLIPADST_ADST + { fidtx4, fidtx16 }, // IDTX + { fdct4, fidtx16 }, // V_DCT + { fidtx4, fdct16 }, // H_DCT + { fadst4, fidtx16 }, // V_ADST + { fidtx4, fadst16 }, // H_ADST + { fadst4, fidtx16 }, // V_FLIPADST + { fidtx4, fadst16 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 4; + const int n4 = 16; + tran_low_t out[16 * 4]; + tran_low_t temp_in[16], temp_out[16]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 4]; + maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < n4; ++i) { + for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n4; ++j) + output[j + i * n4] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct16, fdct8 }, // DCT_DCT + { fadst16, fdct8 }, // ADST_DCT + { fdct16, fadst8 }, // DCT_ADST + { fadst16, fadst8 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst16, fdct8 }, // FLIPADST_DCT + { fdct16, fadst8 }, // DCT_FLIPADST + { fadst16, fadst8 }, // FLIPADST_FLIPADST + { fadst16, fadst8 }, // ADST_FLIPADST + { fadst16, fadst8 }, // FLIPADST_ADST + { fidtx16, fidtx8 }, // IDTX + { fdct16, fidtx8 }, // V_DCT + { fidtx16, fdct8 }, // H_DCT + { fadst16, fidtx8 }, // V_ADST + { fidtx16, fadst8 }, // H_ADST + { fadst16, fidtx8 }, // V_FLIPADST + { fidtx16, fadst8 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 8; + const int n2 = 16; + tran_low_t out[16 * 8]; + tran_low_t temp_in[16], temp_out[16]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 8]; + maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); +#endif + + // Rows + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) + out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); + } + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.cols(temp_in, temp_out); + for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j]; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct8, fdct16 }, // DCT_DCT + { fadst8, fdct16 }, // ADST_DCT + { fdct8, fadst16 }, // DCT_ADST + { fadst8, fadst16 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst8, fdct16 }, // FLIPADST_DCT + { fdct8, fadst16 }, // DCT_FLIPADST + { fadst8, fadst16 }, // FLIPADST_FLIPADST + { fadst8, fadst16 }, // ADST_FLIPADST + { fadst8, fadst16 }, // FLIPADST_ADST + { fidtx8, fidtx16 }, // IDTX + { fdct8, fidtx16 }, // V_DCT + { fidtx8, fdct16 }, // H_DCT + { fadst8, fidtx16 }, // V_ADST + { fidtx8, fadst16 }, // H_ADST + { fadst8, fidtx16 }, // V_FLIPADST + { fidtx8, fadst16 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 8; + const int n2 = 16; + tran_low_t out[16 * 8]; + tran_low_t temp_in[16], temp_out[16]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 8]; + maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) + out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j]; + } + // Note: overall scale factor of transform is 8 times unitary +} + +void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct32, fdct8 }, // DCT_DCT + { fhalfright32, fdct8 }, // ADST_DCT + { fdct32, fadst8 }, // DCT_ADST + { fhalfright32, fadst8 }, // ADST_ADST +#if CONFIG_EXT_TX + { fhalfright32, fdct8 }, // FLIPADST_DCT + { fdct32, fadst8 }, // DCT_FLIPADST + { fhalfright32, fadst8 }, // FLIPADST_FLIPADST + { fhalfright32, fadst8 }, // ADST_FLIPADST + { fhalfright32, fadst8 }, // FLIPADST_ADST + { fidtx32, fidtx8 }, // IDTX + { fdct32, fidtx8 }, // V_DCT + { fidtx32, fdct8 }, // H_DCT + { fhalfright32, fidtx8 }, // V_ADST + { fidtx32, fadst8 }, // H_ADST + { fhalfright32, fidtx8 }, // V_FLIPADST + { fidtx32, fadst8 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 8; + const int n4 = 32; + tran_low_t out[32 * 8]; + tran_low_t temp_in[32], temp_out[32]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[32 * 8]; + maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); +#endif + + // Rows + for (i = 0; i < n4; ++i) { + for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; + } + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; + ht.cols(temp_in, temp_out); + for (j = 0; j < n4; ++j) + output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); + } + // Note: overall scale factor of transform is 4 times unitary +} + +void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct8, fdct32 }, // DCT_DCT + { fadst8, fdct32 }, // ADST_DCT + { fdct8, fhalfright32 }, // DCT_ADST + { fadst8, fhalfright32 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst8, fdct32 }, // FLIPADST_DCT + { fdct8, fhalfright32 }, // DCT_FLIPADST + { fadst8, fhalfright32 }, // FLIPADST_FLIPADST + { fadst8, fhalfright32 }, // ADST_FLIPADST + { fadst8, fhalfright32 }, // FLIPADST_ADST + { fidtx8, fidtx32 }, // IDTX + { fdct8, fidtx32 }, // V_DCT + { fidtx8, fdct32 }, // H_DCT + { fadst8, fidtx32 }, // V_ADST + { fidtx8, fhalfright32 }, // H_ADST + { fadst8, fidtx32 }, // V_FLIPADST + { fidtx8, fhalfright32 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 8; + const int n4 = 32; + tran_low_t out[32 * 8]; + tran_low_t temp_in[32], temp_out[32]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[32 * 8]; + maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < n4; ++i) { + for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n4; ++j) temp_in[j] = out[j + i * n4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n4; ++j) + output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); + } + // Note: overall scale factor of transform is 4 times unitary +} + +void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct32, fdct16 }, // DCT_DCT + { fhalfright32, fdct16 }, // ADST_DCT + { fdct32, fadst16 }, // DCT_ADST + { fhalfright32, fadst16 }, // ADST_ADST +#if CONFIG_EXT_TX + { fhalfright32, fdct16 }, // FLIPADST_DCT + { fdct32, fadst16 }, // DCT_FLIPADST + { fhalfright32, fadst16 }, // FLIPADST_FLIPADST + { fhalfright32, fadst16 }, // ADST_FLIPADST + { fhalfright32, fadst16 }, // FLIPADST_ADST + { fidtx32, fidtx16 }, // IDTX + { fdct32, fidtx16 }, // V_DCT + { fidtx32, fdct16 }, // H_DCT + { fhalfright32, fidtx16 }, // V_ADST + { fidtx32, fadst16 }, // H_ADST + { fhalfright32, fidtx16 }, // V_FLIPADST + { fidtx32, fadst16 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 16; + const int n2 = 32; + tran_low_t out[32 * 16]; + tran_low_t temp_in[32], temp_out[32]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[32 * 16]; + maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); +#endif + + // Rows + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); + ht.rows(temp_in, temp_out); + for (j = 0; j < n; ++j) + out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); + } + + // Columns + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.cols(temp_in, temp_out); + for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j]; + } + // Note: overall scale factor of transform is 4 times unitary +} + +void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct16, fdct32 }, // DCT_DCT + { fadst16, fdct32 }, // ADST_DCT + { fdct16, fhalfright32 }, // DCT_ADST + { fadst16, fhalfright32 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst16, fdct32 }, // FLIPADST_DCT + { fdct16, fhalfright32 }, // DCT_FLIPADST + { fadst16, fhalfright32 }, // FLIPADST_FLIPADST + { fadst16, fhalfright32 }, // ADST_FLIPADST + { fadst16, fhalfright32 }, // FLIPADST_ADST + { fidtx16, fidtx32 }, // IDTX + { fdct16, fidtx32 }, // V_DCT + { fidtx16, fdct32 }, // H_DCT + { fadst16, fidtx32 }, // V_ADST + { fidtx16, fhalfright32 }, // H_ADST + { fadst16, fidtx32 }, // V_FLIPADST + { fidtx16, fhalfright32 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + const int n = 16; + const int n2 = 32; + tran_low_t out[32 * 16]; + tran_low_t temp_in[32], temp_out[32]; + int i, j; +#if CONFIG_EXT_TX + int16_t flipped_input[32 * 16]; + maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < n2; ++i) { + for (j = 0; j < n; ++j) + temp_in[j] = + (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); + ht.cols(temp_in, temp_out); + for (j = 0; j < n; ++j) + out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); + } + + // Rows + for (i = 0; i < n; ++i) { + for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; + ht.rows(temp_in, temp_out); + for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j]; + } + // Note: overall scale factor of transform is 4 times unitary +} + +void av1_fdct8x8_quant_c(const int16_t *input, int stride, + tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + int eob = -1; + + int i, j; + tran_low_t intermediate[64]; + + // Transform columns + { + tran_low_t *output = intermediate; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0 * 8] = (tran_low_t)fdct_round_shift(t0); + output[2 * 8] = (tran_low_t)fdct_round_shift(t2); + output[4 * 8] = (tran_low_t)fdct_round_shift(t1); + output[6 * 8] = (tran_low_t)fdct_round_shift(t3); + + // stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1 * 8] = (tran_low_t)fdct_round_shift(t0); + output[3 * 8] = (tran_low_t)fdct_round_shift(t2); + output[5 * 8] = (tran_low_t)fdct_round_shift(t1); + output[7 * 8] = (tran_low_t)fdct_round_shift(t3); + input++; + output++; + } + } + + // Rows + for (i = 0; i < 8; ++i) { + fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]); + for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2; + } + + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + const qm_val_t iwt = iqm_ptr[rc]; + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; +#endif + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + int tmp32; +#if CONFIG_AOM_QM + tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; +#else + tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; +#endif + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + if (tx_type == DCT_DCT) { + aom_fdct8x8_c(input, output, stride); + } else { + static const transform_2d FHT[] = { + { fdct8, fdct8 }, // DCT_DCT + { fadst8, fdct8 }, // ADST_DCT + { fdct8, fadst8 }, // DCT_ADST + { fadst8, fadst8 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst8, fdct8 }, // FLIPADST_DCT + { fdct8, fadst8 }, // DCT_FLIPADST + { fadst8, fadst8 }, // FLIPADST_FLIPADST + { fadst8, fadst8 }, // ADST_FLIPADST + { fadst8, fadst8 }, // FLIPADST_ADST + { fidtx8, fidtx8 }, // IDTX + { fdct8, fidtx8 }, // V_DCT + { fidtx8, fdct8 }, // H_DCT + { fadst8, fidtx8 }, // V_ADST + { fidtx8, fadst8 }, // H_ADST + { fadst8, fidtx8 }, // V_FLIPADST + { fidtx8, fadst8 }, // H_FLIPADST +#endif // CONFIG_EXT_TX + }; + const transform_2d ht = FHT[tx_type]; + tran_low_t out[64]; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + +#if CONFIG_EXT_TX + int16_t flipped_input[8 * 8]; + maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j]; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } + } +} + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; + } +} + +void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct16, fdct16 }, // DCT_DCT + { fadst16, fdct16 }, // ADST_DCT + { fdct16, fadst16 }, // DCT_ADST + { fadst16, fadst16 }, // ADST_ADST +#if CONFIG_EXT_TX + { fadst16, fdct16 }, // FLIPADST_DCT + { fdct16, fadst16 }, // DCT_FLIPADST + { fadst16, fadst16 }, // FLIPADST_FLIPADST + { fadst16, fadst16 }, // ADST_FLIPADST + { fadst16, fadst16 }, // FLIPADST_ADST + { fidtx16, fidtx16 }, // IDTX + { fdct16, fidtx16 }, // V_DCT + { fidtx16, fdct16 }, // H_DCT + { fadst16, fidtx16 }, // V_ADST + { fidtx16, fadst16 }, // H_ADST + { fadst16, fidtx16 }, // V_FLIPADST + { fidtx16, fadst16 }, // H_FLIPADST +#endif // CONFIG_EXT_TX + }; + + const transform_2d ht = FHT[tx_type]; + tran_low_t out[256]; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + +#if CONFIG_EXT_TX + int16_t flipped_input[16 * 16]; + maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + } + + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j]; + } +} + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht4x4_c(input, output, stride, tx_type); +} + +void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht4x8_c(input, output, stride, tx_type); +} + +void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht8x4_c(input, output, stride, tx_type); +} + +void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht8x16_c(input, output, stride, tx_type); +} + +void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht16x8_c(input, output, stride, tx_type); +} + +void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht16x32_c(input, output, stride, tx_type); +} + +void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht32x16_c(input, output, stride, tx_type); +} + +void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht4x16_c(input, output, stride, tx_type); +} + +void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht16x4_c(input, output, stride, tx_type); +} + +void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht8x32_c(input, output, stride, tx_type); +} + +void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht32x8_c(input, output, stride, tx_type); +} + +void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht8x8_c(input, output, stride, tx_type); +} + +void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + av1_fwht4x4_c(input, output, stride); +} + +void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht16x16_c(input, output, stride, tx_type); +} +#endif // CONFIG_HIGHBITDEPTH + +void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct32, fdct32 }, // DCT_DCT +#if CONFIG_EXT_TX + { fhalfright32, fdct32 }, // ADST_DCT + { fdct32, fhalfright32 }, // DCT_ADST + { fhalfright32, fhalfright32 }, // ADST_ADST + { fhalfright32, fdct32 }, // FLIPADST_DCT + { fdct32, fhalfright32 }, // DCT_FLIPADST + { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST + { fhalfright32, fhalfright32 }, // ADST_FLIPADST + { fhalfright32, fhalfright32 }, // FLIPADST_ADST + { fidtx32, fidtx32 }, // IDTX + { fdct32, fidtx32 }, // V_DCT + { fidtx32, fdct32 }, // H_DCT + { fhalfright32, fidtx32 }, // V_ADST + { fidtx32, fhalfright32 }, // H_ADST + { fhalfright32, fidtx32 }, // V_FLIPADST + { fidtx32, fhalfright32 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + tran_low_t out[1024]; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + +#if CONFIG_EXT_TX + int16_t flipped_input[32 * 32]; + maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type); +#endif + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 32; ++j) + out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4); + } + + // Rows + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j]; + } +} + +#if CONFIG_TX64X64 +#if CONFIG_EXT_TX +static void fidtx64(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 64; ++i) + output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2); +} + +// For use in lieu of ADST +static void fhalfright64(const tran_low_t *input, tran_low_t *output) { + int i; + tran_low_t inputhalf[32]; + for (i = 0; i < 32; ++i) { + output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2); + } + // Multiply input by sqrt(2) + for (i = 0; i < 32; ++i) { + inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2); + } + fdct32(inputhalf, output); + // Note overall scaling factor is 2 times unitary +} +#endif // CONFIG_EXT_TX + +static void fdct64_col(const tran_low_t *input, tran_low_t *output) { + int32_t in[64], out[64]; + int i; + for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; + av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64, + fwd_stage_range_col_dct_dct_64); + for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; +} + +static void fdct64_row(const tran_low_t *input, tran_low_t *output) { + int32_t in[64], out[64]; + int i; + for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; + av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64, + fwd_stage_range_row_dct_dct_64); + for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; +} + +void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + static const transform_2d FHT[] = { + { fdct64_col, fdct64_row }, // DCT_DCT +#if CONFIG_EXT_TX + { fhalfright64, fdct64_row }, // ADST_DCT + { fdct64_col, fhalfright64 }, // DCT_ADST + { fhalfright64, fhalfright64 }, // ADST_ADST + { fhalfright64, fdct64_row }, // FLIPADST_DCT + { fdct64_col, fhalfright64 }, // DCT_FLIPADST + { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST + { fhalfright64, fhalfright64 }, // ADST_FLIPADST + { fhalfright64, fhalfright64 }, // FLIPADST_ADST + { fidtx64, fidtx64 }, // IDTX + { fdct64_col, fidtx64 }, // V_DCT + { fidtx64, fdct64_row }, // H_DCT + { fhalfright64, fidtx64 }, // V_ADST + { fidtx64, fhalfright64 }, // H_ADST + { fhalfright64, fidtx64 }, // V_FLIPADST + { fidtx64, fhalfright64 }, // H_FLIPADST +#endif + }; + const transform_2d ht = FHT[tx_type]; + tran_low_t out[4096]; + int i, j; + tran_low_t temp_in[64], temp_out[64]; +#if CONFIG_EXT_TX + int16_t flipped_input[64 * 64]; + maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type); +#endif + // Columns + for (i = 0; i < 64; ++i) { + for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 64; ++j) + out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 64; ++i) { + for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 64; ++j) + output[j + i * 64] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} +#endif // CONFIG_TX64X64 + +#if CONFIG_EXT_TX +// Forward identity transform. +void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, + int bs, int tx_type) { + int r, c; + const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1); + if (tx_type == IDTX) { + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift); + src_diff += stride; + coeff += bs; + } + } +} +#endif // CONFIG_EXT_TX + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht32x32_c(input, output, stride, tx_type); +} + +#if CONFIG_TX64X64 +void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + av1_fht64x64_c(input, output, stride, tx_type); +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH +#endif // !AV1_DCT_GTEST diff --git a/third_party/aom/av1/encoder/encint.h b/third_party/aom/av1/encoder/encint.h new file mode 100644 index 000000000..30ea8521f --- /dev/null +++ b/third_party/aom/av1/encoder/encint.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +/* clang-format off */ + +#if !defined(_encint_H) +# define _encint_H (1) + +typedef struct daala_enc_ctx od_enc_ctx; +typedef struct od_params_ctx od_params_ctx; +typedef struct od_rollback_buffer od_rollback_buffer; + +# include "aom_dsp/entenc.h" +# include "av1/common/odintrin.h" +# include "av1/common/pvq_state.h" + +struct daala_enc_ctx{ + /* Stores context-adaptive CDFs for PVQ. */ + od_state state; + /* AOM entropy encoder. */ + aom_writer w; + int use_activity_masking; + /* Mode of quantization matrice : FLAT (0) or HVS (1) */ + int qm; + /*Normalized PVQ lambda for use where we've already performed + quantization.*/ + double pvq_norm_lambda; + double pvq_norm_lambda_dc; +}; + +// from daalaenc.h +/**The encoder context.*/ +typedef struct daala_enc_ctx daala_enc_ctx; + +/** Holds important encoder information so we can roll back decisions */ +struct od_rollback_buffer { + od_ec_enc ec; + od_adapt_ctx adapt; +}; + +void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf); +void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf); + +#endif diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c new file mode 100644 index 000000000..d254157e7 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -0,0 +1,7160 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./av1_rtcd.h" +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/system_state.h" + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/idct.h" +#include "av1/common/mv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconintra.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#if CONFIG_SUPERTX +#include "av1/encoder/cost.h" +#endif +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION +#include "av1/common/warped_motion.h" +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION +#if CONFIG_GLOBAL_MOTION +#include "av1/encoder/global_motion.h" +#endif // CONFIG_GLOBAL_MOTION +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.h" +#endif +#include "av1/encoder/ethread.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/tokenize.h" +#if CONFIG_PVQ +#include "av1/common/pvq.h" +#include "av1/encoder/pvq_encoder.h" +#endif +#if CONFIG_HIGHBITDEPTH +#define IF_HBD(...) __VA_ARGS__ +#else +#define IF_HBD(...) +#endif // CONFIG_HIGHBITDEPTH + +static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int *rate); + +#if CONFIG_SUPERTX +static int check_intra_b(PICK_MODE_CONTEXT *ctx); + +static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PC_TREE *pc_tree); +static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, +#if CONFIG_EXT_INTER + int mi_row_ori, int mi_col_ori, +#endif // CONFIG_EXT_INTER + int mi_row_pred, int mi_col_pred, + BLOCK_SIZE bsize_pred, int b_sub8x8, int block); +static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, + PC_TREE *pc_tree); +static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, + int mi_col, int mi_row_ori, int mi_col_ori, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], + int dst_stride[3], PC_TREE *pc_tree); +static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RUN_TYPE dry_run, PC_TREE *pc_tree); +static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, + TX_TYPE *best_tx, PC_TREE *pc_tree); +#endif // CONFIG_SUPERTX + +// This is used as a reference when computing the source variance for the +// purposes of activity masking. +// Eventually this should be replaced by custom no-reference routines, +// which will be faster. +static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, +#if CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +#endif // CONFIG_EXT_PARTITION +}; + +#if CONFIG_HIGHBITDEPTH +static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, +#if CONFIG_EXT_PARTITION + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +#endif // CONFIG_EXT_PARTITION +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, +#if CONFIG_EXT_PARTITION + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +#endif // CONFIG_EXT_PARTITION +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, +#if CONFIG_EXT_PARTITION + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16 +#endif // CONFIG_EXT_PARTITION +}; +#endif // CONFIG_HIGHBITDEPTH + +unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + unsigned int sse; + const unsigned int var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +#if CONFIG_HIGHBITDEPTH +unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + unsigned int var, sse; + switch (bd) { + case 10: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), 0, &sse); + break; + case 12: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), 0, &sse); + break; + case 8: + default: + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), 0, &sse); + break; + } + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} +#endif // CONFIG_HIGHBITDEPTH + +static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, + const struct buf_2d *ref, + int mi_row, int mi_col, + BLOCK_SIZE bs) { + unsigned int sse, var; + uint8_t *last_y; + const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); + + assert(last != NULL); + last_y = + &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance( + cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} + +// Lighter version of set_offsets that only sets the mode info +// pointers. +static void set_mode_info_offsets(const AV1_COMP *const cpi, + MACROBLOCK *const x, MACROBLOCKD *const xd, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int idx_str = xd->mi_stride * mi_row + mi_col; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; + x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); +} + +static void set_offsets_without_segment_id(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_skip_context(xd, mi_row, mi_col); + + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + +#if CONFIG_VAR_TX + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->max_tx_size = max_txsize_lookup[bsize]; +#endif + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + x->mv_limits.row_min = + -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND; + x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND; + + set_plane_n4(xd, mi_width, mi_height); + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col); + + // R/D setup. + x->rddiv = cpi->rd.RDDIV; + x->rdmult = cpi->rd.RDMULT; + + // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() + xd->tile = *tile; +} + +static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + const struct segmentation *const seg = &cm->seg; + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + + mbmi = &xd->mi[0]->mbmi; + + // Setup segment ID. + if (seg->enabled) { + if (!cpi->vaq_refresh) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + av1_init_plane_quantizers(cpi, x, mbmi->segment_id); + } else { + mbmi->segment_id = 0; + } + +#if CONFIG_SUPERTX + mbmi->segment_id_supertx = MAX_SEGMENTS; +#endif // CONFIG_SUPERTX +} + +#if CONFIG_SUPERTX +static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + MACROBLOCK *const x = &td->mb; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; +#if CONFIG_DEPENDENT_HORZTILES + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col, cm->dependent_horz_tiles); +#else + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); +#endif + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); +} + +static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row_pred, + int mi_col_pred, int mi_row_ori, int mi_col_ori, + BLOCK_SIZE bsize_pred) { + // Used in supertx + // (mi_row_ori, mi_col_ori, bsize_ori): region for mv + // (mi_row_pred, mi_col_pred, bsize_pred): region to predict + MACROBLOCK *const x = &td->mb; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize_pred]; + const int mi_height = mi_size_high[bsize_pred]; + +#if CONFIG_DEPENDENT_HORZTILES + set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori, + cm->dependent_horz_tiles); +#else + set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori); +#endif + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + x->mv_limits.row_min = + -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.col_min = + -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.row_max = + (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND; + x->mv_limits.col_max = + (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND; + +// Set up distance of MB to edge of frame in 1/8th pel units. +#if !CONFIG_CB4X4 + assert(!(mi_col_pred & (mi_width - mi_size_wide[BLOCK_8X8])) && + !(mi_row_pred & (mi_height - mi_size_high[BLOCK_8X8]))); +#endif + set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + xd->up_available = (mi_row_ori > tile->mi_row_start); + xd->left_available = (mi_col_ori > tile->mi_col_start); + + // R/D setup. + x->rddiv = cpi->rd.RDDIV; + x->rdmult = cpi->rd.RDMULT; +} + +static void set_segment_id_supertx(const AV1_COMP *const cpi, + MACROBLOCK *const x, const int mi_row, + const int mi_col, const BLOCK_SIZE bsize) { + const AV1_COMMON *cm = &cpi->common; + const struct segmentation *seg = &cm->seg; + const int miw = AOMMIN(mi_size_wide[bsize], cm->mi_cols - mi_col); + const int mih = AOMMIN(mi_size_high[bsize], cm->mi_rows - mi_row); + const int mi_offset = mi_row * cm->mi_stride + mi_col; + MODE_INFO **const mip = cm->mi_grid_visible + mi_offset; + int r, c; + int seg_id_supertx = MAX_SEGMENTS; + + if (!seg->enabled) { + seg_id_supertx = 0; + } else { + // Find the minimum segment_id + for (r = 0; r < mih; r++) + for (c = 0; c < miw; c++) + seg_id_supertx = + AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx); + assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS); + + // Initialize plane quantisers + av1_init_plane_quantizers(cpi, x, seg_id_supertx); + } + + // Assign the the segment_id back to segment_id_supertx + for (r = 0; r < mih; r++) + for (c = 0; c < miw; c++) + mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx; +} +#endif // CONFIG_SUPERTX + +static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { + const int mi_width = AOMMAX(mi_size_wide[bsize], mi_size_wide[BLOCK_8X8]); + const int mi_height = AOMMAX(mi_size_high[bsize], mi_size_high[BLOCK_8X8]); + for (int r = 0; r < mi_height; ++r) { + for (int c = 0; c < mi_width; ++c) { + set_mode_info_offsets(cpi, x, xd, mi_row + r, mi_col + c); + xd->mi[0]->mbmi.sb_type = bsize; + } + } + } +} + +static void set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, VAR_TREE *vt, int mi_row, + int mi_col, const int64_t *const threshold, + const BLOCK_SIZE *const bsize_min) { + AV1_COMMON *const cm = &cpi->common; + const int hbw = mi_size_wide[vt->bsize] / 2; + const int hbh = mi_size_high[vt->bsize] / 2; + const int has_cols = mi_col + hbw < cm->mi_cols; + const int has_rows = mi_row + hbh < cm->mi_rows; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + assert(vt->bsize >= BLOCK_8X8); + + assert(hbh == hbw); + + if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) { + set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8); + return; + } + + if (vt->force_split || (!has_cols && !has_rows)) goto split; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (vt->bsize == bsize_min[0]) { + if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) { + set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize); + return; + } else { + BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT); + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + if (vt->bsize > BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize); + set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize); + } + return; + } + } else if (vt->bsize > bsize_min[0]) { + // For key frame: take split for bsize above 32X32 or very high variance. + if (cm->frame_type == KEY_FRAME && + (vt->bsize > BLOCK_32X32 || + vt->variances.none.variance > (threshold[0] << 4))) { + goto split; + } + // If variance is low, take the bsize (no split). + if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) { + set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize); + return; + } + + // Check vertical split. + if (has_rows) { + BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT); + if (vt->variances.vert[0].variance < threshold[0] && + vt->variances.vert[1].variance < threshold[0] && + get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize); + return; + } + } + // Check horizontal split. + if (has_cols) { + BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ); + if (vt->variances.horz[0].variance < threshold[0] && + vt->variances.horz[1].variance < threshold[0] && + get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize); + return; + } + } + } + +split : { + set_vt_partitioning(cpi, x, xd, vt->split[0], mi_row, mi_col, threshold + 1, + bsize_min + 1); + set_vt_partitioning(cpi, x, xd, vt->split[1], mi_row, mi_col + hbw, + threshold + 1, bsize_min + 1); + set_vt_partitioning(cpi, x, xd, vt->split[2], mi_row + hbh, mi_col, + threshold + 1, bsize_min + 1); + set_vt_partitioning(cpi, x, xd, vt->split[3], mi_row + hbh, mi_col + hbw, + threshold + 1, bsize_min + 1); + return; +} +} + +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, +// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int threshold_multiplier = is_key_frame ? 20 : 1; + const int64_t threshold_base = + (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); + if (is_key_frame) { + thresholds[1] = threshold_base; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base >> 2; + thresholds[4] = threshold_base << 2; + } else { + thresholds[2] = threshold_base; + if (cm->width <= 352 && cm->height <= 288) { + thresholds[1] = threshold_base >> 2; + thresholds[3] = threshold_base << 3; + } else { + thresholds[1] = threshold_base; + thresholds[2] = (5 * threshold_base) >> 2; + if (cm->width >= 1920 && cm->height >= 1080) + thresholds[2] = (7 * threshold_base) >> 2; + thresholds[3] = threshold_base << cpi->oxcf.speed; + } + } + thresholds[0] = INT64_MIN; +} + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + const int is_key_frame = (cm->frame_type == KEY_FRAME); + if (sf->partition_search_type != VAR_BASED_PARTITION && + sf->partition_search_type != REFERENCE_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + // The thresholds below are not changed locally. + if (is_key_frame) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_bsize_min = BLOCK_8X8; + } else { + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_sad = 100; + else + cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 + ? (cpi->y_dequant[q][1] << 1) + : 1000; + cpi->vbp_bsize_min = BLOCK_16X16; + } + cpi->vbp_threshold_minmax = 15 + (q >> 3); + } +} + +// Compute the minmax over the 8x8 subblocks. +static int compute_minmax_8x8(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, +#if CONFIG_HIGHBITDEPTH + int highbd, +#endif + int pixels_wide, int pixels_high) { + int k; + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. + for (k = 0; k < 4; k++) { + const int x8_idx = ((k & 1) << 3); + const int y8_idx = ((k >> 1) << 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + const int src_offset = y8_idx * src_stride + x8_idx; + const int ref_offset = y8_idx * ref_stride + x8_idx; +#if CONFIG_HIGHBITDEPTH + if (highbd) { + aom_highbd_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, + ref_stride, &min, &max); + } else { + aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, + ref_stride, &min, &max); + } +#else + aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, ref_stride, + &min, &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE int avg_4x4(const uint8_t *const src, const int stride, + const int highbd) { + if (highbd) { + return aom_highbd_avg_4x4(src, stride); + } else { + return aom_avg_4x4(src, stride); + } +} +#else +static INLINE int avg_4x4(const uint8_t *const src, const int stride) { + return aom_avg_4x4(src, stride); +} +#endif + +#if CONFIG_HIGHBITDEPTH +static INLINE int avg_8x8(const uint8_t *const src, const int stride, + const int highbd) { + if (highbd) { + return aom_highbd_avg_8x8(src, stride); + } else { + return aom_avg_8x8(src, stride); + } +} +#else +static INLINE int avg_8x8(const uint8_t *const src, const int stride) { + return aom_avg_8x8(src, stride); +} +#endif + +static void init_variance_tree(VAR_TREE *const vt, +#if CONFIG_HIGHBITDEPTH + const int highbd, +#endif + BLOCK_SIZE bsize, BLOCK_SIZE leaf_size, + const int width, const int height, + const uint8_t *const src, const int src_stride, + const uint8_t *const ref, const int ref_stride) { + assert(bsize >= leaf_size); + + vt->bsize = bsize; + + vt->force_split = 0; + + vt->src = src; + vt->src_stride = src_stride; + vt->ref = ref; + vt->ref_stride = ref_stride; + + vt->width = width; + vt->height = height; + +#if CONFIG_HIGHBITDEPTH + vt->highbd = highbd; +#endif // CONFIG_HIGHBITDEPTH + + if (bsize > leaf_size) { + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int px = block_size_wide[subsize]; + + init_variance_tree(vt->split[0], +#if CONFIG_HIGHBITDEPTH + highbd, +#endif // CONFIG_HIGHBITDEPTH + subsize, leaf_size, AOMMIN(px, width), + AOMMIN(px, height), src, src_stride, ref, ref_stride); + init_variance_tree(vt->split[1], +#if CONFIG_HIGHBITDEPTH + highbd, +#endif // CONFIG_HIGHBITDEPTH + subsize, leaf_size, width - px, AOMMIN(px, height), + src + px, src_stride, ref + px, ref_stride); + init_variance_tree(vt->split[2], +#if CONFIG_HIGHBITDEPTH + highbd, +#endif // CONFIG_HIGHBITDEPTH + subsize, leaf_size, AOMMIN(px, width), height - px, + src + px * src_stride, src_stride, ref + px * ref_stride, + ref_stride); + init_variance_tree(vt->split[3], +#if CONFIG_HIGHBITDEPTH + highbd, +#endif // CONFIG_HIGHBITDEPTH + subsize, leaf_size, width - px, height - px, + src + px * src_stride + px, src_stride, + ref + px * ref_stride + px, ref_stride); + } +} + +// Fill the variance tree based on averaging pixel values (sub-sampling), at +// the leaf node size. +static void fill_variance_tree(VAR_TREE *const vt, const BLOCK_SIZE leaf_size) { + if (vt->bsize > leaf_size) { + fill_variance_tree(vt->split[0], leaf_size); + fill_variance_tree(vt->split[1], leaf_size); + fill_variance_tree(vt->split[2], leaf_size); + fill_variance_tree(vt->split[3], leaf_size); + fill_variance_node(vt); + } else if (vt->width <= 0 || vt->height <= 0) { + fill_variance(0, 0, 0, &vt->variances.none); + } else { + unsigned int sse = 0; + int sum = 0; + int src_avg; + int ref_avg; + assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8); + if (leaf_size == BLOCK_4X4) { + src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd)); + ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); + } else { + src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd)); + ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); + } + sum = src_avg - ref_avg; + sse = sum * sum; + fill_variance(sse, sum, 0, &vt->variances.none); + } +} + +static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) { + if (vt->bsize >= BLOCK_8X8) { + if (vt->bsize == BLOCK_16X16) { + if (vt->variances.none.variance <= threshold) + return; + else + vt->force_split = 0; + } + + refine_variance_tree(vt->split[0], threshold); + refine_variance_tree(vt->split[1], threshold); + refine_variance_tree(vt->split[2], threshold); + refine_variance_tree(vt->split[3], threshold); + + if (vt->bsize <= BLOCK_16X16) fill_variance_node(vt); + } else if (vt->width <= 0 || vt->height <= 0) { + fill_variance(0, 0, 0, &vt->variances.none); + } else { + const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd)); + const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); + const int sum = src_avg - ref_avg; + const unsigned int sse = sum * sum; + assert(vt->bsize == BLOCK_4X4); + fill_variance(sse, sum, 0, &vt->variances.none); + } +} + +static int check_split_key_frame(VAR_TREE *const vt, const int64_t threshold) { + if (vt->bsize == BLOCK_32X32) { + vt->force_split = vt->variances.none.variance > threshold; + } else { + vt->force_split |= check_split_key_frame(vt->split[0], threshold); + vt->force_split |= check_split_key_frame(vt->split[1], threshold); + vt->force_split |= check_split_key_frame(vt->split[2], threshold); + vt->force_split |= check_split_key_frame(vt->split[3], threshold); + } + return vt->force_split; +} + +static int check_split(AV1_COMP *const cpi, VAR_TREE *const vt, + const int segment_id, const int64_t *const thresholds) { + if (vt->bsize == BLOCK_16X16) { + vt->force_split = vt->variances.none.variance > thresholds[0]; + if (!vt->force_split && vt->variances.none.variance > thresholds[-1] && + !cyclic_refresh_segment_id_boosted(segment_id)) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above threshold, + // force split to 8x8 block for this 16x16 block. + int minmax = + compute_minmax_8x8(vt->src, vt->src_stride, vt->ref, vt->ref_stride, +#if CONFIG_HIGHBITDEPTH + vt->highbd, +#endif + vt->width, vt->height); + vt->force_split = minmax > cpi->vbp_threshold_minmax; + } + } else { + vt->force_split |= + check_split(cpi, vt->split[0], segment_id, thresholds + 1); + vt->force_split |= + check_split(cpi, vt->split[1], segment_id, thresholds + 1); + vt->force_split |= + check_split(cpi, vt->split[2], segment_id, thresholds + 1); + vt->force_split |= + check_split(cpi, vt->split[3], segment_id, thresholds + 1); + + if (vt->bsize == BLOCK_32X32 && !vt->force_split) { + vt->force_split = vt->variances.none.variance > thresholds[0]; + } + } + + return vt->force_split; +} + +// This function chooses partitioning based on the variance between source and +// reconstructed last (or golden), where variance is computed for down-sampled +// inputs. +static void choose_partitioning(AV1_COMP *const cpi, ThreadData *const td, + const TileInfo *const tile, MACROBLOCK *const x, + const int mi_row, const int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; +#if CONFIG_DUAL_FILTER + int i; +#endif + const uint8_t *src; + const uint8_t *ref; + int src_stride; + int ref_stride; + int pixels_wide = MI_SIZE * mi_size_wide[cm->sb_size]; + int pixels_high = MI_SIZE * mi_size_high[cm->sb_size]; + int64_t thresholds[5] = { + cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], + cpi->vbp_thresholds[3], cpi->vbp_thresholds[4], + }; + BLOCK_SIZE bsize_min[5] = { BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + cpi->vbp_bsize_min, BLOCK_8X8 }; + const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0; + const int64_t *const thre = thresholds + start_level; + const BLOCK_SIZE *const bmin = bsize_min + start_level; + + const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int low_res = (cm->width <= 352 && cm->height <= 288); + + int segment_id = CR_SEGMENT_ID_BASE; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col); + + if (cyclic_refresh_segment_id_boosted(segment_id)) { + int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + set_vbp_thresholds(cpi, thresholds, q); + } + } + + set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size); + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + src = x->plane[0].src.buf; + src_stride = x->plane[0].src.stride; + + if (!is_key_frame) { + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + unsigned int y_sad, y_sad_g; + + const int hbs = cm->mib_size / 2; + const int split_vert = mi_col + hbs >= cm->mi_cols; + const int split_horz = mi_row + hbs >= cm->mi_rows; + BLOCK_SIZE bsize; + + if (split_vert && split_horz) + bsize = get_subsize(cm->sb_size, PARTITION_SPLIT); + else if (split_vert) + bsize = get_subsize(cm->sb_size, PARTITION_VERT); + else if (split_horz) + bsize = get_subsize(cm->sb_size, PARTITION_HORZ); + else + bsize = cm->sb_size; + + assert(yv12 != NULL); + + if (yv12_g && yv12_g != yv12) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->sb_type = cm->sb_size; + mbmi->mv[0].as_int = 0; +#if CONFIG_DUAL_FILTER + for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = BILINEAR; +#else + mbmi->interp_filter = BILINEAR; +#endif + + y_sad = av1_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + + if (y_sad_g < y_sad) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mbmi->ref_frame[0] = GOLDEN_FRAME; + mbmi->mv[0].as_int = 0; + y_sad = y_sad_g; + } else { + x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv; + } + + av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, cm->sb_size); + + ref = xd->plane[0].dst.buf; + ref_stride = xd->plane[0].dst.stride; + + // If the y_sad is very small, take the largest partition and exit. + // Don't check on boosted segment for now, as largest is suppressed there. + if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) { + if (!split_vert && !split_horz) { + set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size); + return; + } + } + } else { + ref = AV1_VAR_OFFS; + ref_stride = 0; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (xd->bd) { + case 10: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break; + case 12: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break; + case 8: + default: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break; + } + } +#endif // CONFIG_HIGHBITDEPTH + } + + init_variance_tree( + vt, +#if CONFIG_HIGHBITDEPTH + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, +#endif // CONFIG_HIGHBITDEPTH + cm->sb_size, (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8, + pixels_wide, pixels_high, src, src_stride, ref, ref_stride); + + // Fill in the entire tree of variances and compute splits. + if (is_key_frame) { + fill_variance_tree(vt, BLOCK_4X4); + check_split_key_frame(vt, thre[1]); + } else { + fill_variance_tree(vt, BLOCK_8X8); + check_split(cpi, vt, segment_id, thre); + if (low_res) { + refine_variance_tree(vt, thre[1] << 1); + } + } + + vt->force_split |= mi_col + cm->mib_size > cm->mi_cols || + mi_row + cm->mib_size > cm->mi_rows; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin); +} + +#if CONFIG_DUAL_FILTER +static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + if (!has_subpel_mv_component(xd->mi[0], xd, dir) && + (mbmi->ref_frame[1] == NONE_FRAME || + !has_subpel_mv_component(xd->mi[0], xd, dir + 2))) + mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE) + ? EIGHTTAP_REGULAR + : cm->interp_filter; + mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir]; + } +} + +static void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + if (has_subpel_mv_component(xd->mi[0], xd, dir) || + (mbmi->ref_frame[1] > INTRA_FRAME && + has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]]; + } + } +} +#endif +#if CONFIG_GLOBAL_MOTION +static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize, + const MB_MODE_INFO *mbmi, + RD_COUNTS *rdc) { + if (mode == ZEROMV +#if CONFIG_EXT_INTER + || mode == ZERO_ZEROMV +#endif + ) { + const int num_4x4s = + num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize]; + int ref; + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s; + } + } +} +#endif // CONFIG_GLOBAL_MOTION + +static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + const TX_MODE tx_mode) { + if (xd->lossless[mbmi->segment_id]) { + mbmi->tx_size = TX_4X4; + } else if (tx_mode != TX_MODE_SELECT) { + mbmi->tx_size = + tx_size_from_tx_mode(mbmi->sb_type, tx_mode, is_inter_block(mbmi)); + } +} + +#if CONFIG_REF_MV +static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, + int8_t rf_type) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + + const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bh = xd->n8_h << MI_SIZE_LOG2; + int ref_mv_idx = mbmi->ref_mv_idx; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type]; + +#if CONFIG_EXT_INTER + if (has_second_ref(mbmi)) { + // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx + // (like NEARMV) instead + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx += 1; + + if (compound_ref0_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + mbmi->pred_mv[0] = this_mv; + mi_pred_mv[0] = this_mv; + } + if (compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv; + clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; + mbmi->pred_mv[1] = this_mv; + mi_pred_mv[1] = this_mv; + } + } else { +#endif // CONFIG_EXT_INTER + if (mbmi->mode == NEWMV) { + int i; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { + int_mv this_mv = (i == 0) ? curr_ref_mv_stack[ref_mv_idx].this_mv + : curr_ref_mv_stack[ref_mv_idx].comp_mv; + clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv; + mbmi->pred_mv[i] = this_mv; + mi_pred_mv[i] = this_mv; + } + } +#if CONFIG_EXT_INTER + } +#endif // CONFIG_EXT_INTER +} +#endif // CONFIG_REF_MV + +static void update_state(const AV1_COMP *const cpi, ThreadData *td, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, RUN_TYPE dry_run) { + int i, x_idx, y; + const AV1_COMMON *const cm = &cpi->common; + RD_COUNTS *const rdc = &td->rd_counts; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + MODE_INFO *mi = &ctx->mic; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MODE_INFO *mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + const int bw = mi_size_wide[mi->mbmi.sb_type]; + const int bh = mi_size_high[mi->mbmi.sb_type]; + const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row); + MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + const int mis = cm->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const int unify_bsize = CONFIG_CB4X4; + +#if CONFIG_REF_MV + int8_t rf_type; +#endif + +#if !CONFIG_SUPERTX + assert(mi->mbmi.sb_type == bsize); +#endif + + *mi_addr = *mi; + *x->mbmi_ext = ctx->mbmi_ext; + +#if CONFIG_DUAL_FILTER + reset_intmv_filter_type(cm, xd, mbmi); +#endif + +#if CONFIG_REF_MV + rf_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && + (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { + set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); + } +#endif // CONFIG_REF_MV + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, + bsize, ctx->rate, ctx->dist, x->skip); + reset_tx_size(xd, &mi_addr->mbmi, cm->tx_mode); + } + } + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; +#if CONFIG_PVQ + pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; +#endif + p[i].eobs = ctx->eobs[i]; +#if CONFIG_LV_MAP + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; +#endif // CONFIG_LV_MAP + } +#if CONFIG_PALETTE + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; +#endif // CONFIG_PALETTE + + // Restore the coding context of the MB to that that was in place + // when the mode was picked for it + for (y = 0; y < mi_height; y++) + for (x_idx = 0; x_idx < mi_width; x_idx++) + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { + xd->mi[x_idx + y * mis] = mi_addr; + } + +#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q + if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ) + av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); +#else + if (cpi->oxcf.aq_mode) + av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); +#endif + + if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } + + x->skip = ctx->skip; + +#if CONFIG_VAR_TX + for (i = 0; i < 1; ++i) + memcpy(x->blk_skip[i], ctx->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); +#endif + + if (dry_run) return; + +#if CONFIG_INTERNAL_STATS + { + unsigned int *const mode_chosen_counts = + (unsigned int *)cpi->mode_chosen_counts; // Cast const away. + if (frame_is_intra_only(cm)) { + static const int kf_mode_index[] = { + THR_DC /*DC_PRED*/, + THR_V_PRED /*V_PRED*/, + THR_H_PRED /*H_PRED*/, + THR_D45_PRED /*D45_PRED*/, + THR_D135_PRED /*D135_PRED*/, + THR_D117_PRED /*D117_PRED*/, + THR_D153_PRED /*D153_PRED*/, + THR_D207_PRED /*D207_PRED*/, + THR_D63_PRED /*D63_PRED*/, +#if CONFIG_ALT_INTRA + THR_SMOOTH, /*SMOOTH_PRED*/ +#endif // CONFIG_ALT_INTRA + THR_TM /*TM_PRED*/, + }; + ++mode_chosen_counts[kf_mode_index[mbmi->mode]]; + } else { + // Note how often each mode chosen as best + ++mode_chosen_counts[ctx->best_mode_index]; + } + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + av1_update_mv_count(td); +#if CONFIG_GLOBAL_MOTION + if (bsize >= BLOCK_8X8) { + // TODO(sarahparker): global motion stats need to be handled per-tile + // to be compatible with tile-based threading. + update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); + } + } + } +#endif // CONFIG_GLOBAL_MOTION + if (cm->interp_filter == SWITCHABLE +#if CONFIG_WARPED_MOTION + && mbmi->motion_mode != WARPED_CAUSAL +#endif // CONFIG_WARPED_MOTION +#if CONFIG_GLOBAL_MOTION + && !is_nontrans_global_motion(xd) +#endif // CONFIG_GLOBAL_MOTION + ) { +#if CONFIG_DUAL_FILTER + update_filter_type_count(td->counts, xd, mbmi); +#else + const int switchable_ctx = av1_get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter]; +#endif + } + } + + rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; + rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; + rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; + } + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->mbmi.ref_frame[0]; + mv->ref_frame[1] = mi->mbmi.ref_frame[1]; + mv->mv[0].as_int = mi->mbmi.mv[0].as_int; + mv->mv[1].as_int = mi->mbmi.mv[1].as_int; + } + } +} + +#if CONFIG_SUPERTX +static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, RUN_TYPE dry_run) { + int y, x_idx; +#if CONFIG_VAR_TX + int i; +#endif + const AV1_COMMON *const cm = &cpi->common; + RD_COUNTS *const rdc = &td->rd_counts; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = &ctx->mic; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MODE_INFO *mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + const int mis = cm->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col); + const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row); + const int unify_bsize = CONFIG_CB4X4; + MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + +#if CONFIG_REF_MV + int8_t rf_type; +#endif + + *mi_addr = *mi; + *x->mbmi_ext = ctx->mbmi_ext; + assert(is_inter_block(mbmi)); + assert(mbmi->tx_size == ctx->mic.mbmi.tx_size); + +#if CONFIG_DUAL_FILTER + reset_intmv_filter_type(cm, xd, mbmi); +#endif + +#if CONFIG_REF_MV + rf_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && + (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { + set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); + } +#endif // CONFIG_REF_MV + + // If segmentation in use + if (seg->enabled) { + if (cpi->vaq_refresh) { + const int energy = + bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); + mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + // For cyclic refresh mode, now update the segment map + // and set the segment id. + av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, + bsize, ctx->rate, ctx->dist, 1); + } else { + // Otherwise just set the segment id based on the current segment map + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS; + } + + // Restore the coding context of the MB to that that was in place + // when the mode was picked for it + for (y = 0; y < mi_height; y++) + for (x_idx = 0; x_idx < mi_width; x_idx++) + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { + xd->mi[x_idx + y * mis] = mi_addr; + } + +#if !CONFIG_CB4X4 + if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) { + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } +#endif + + x->skip = ctx->skip; + +#if CONFIG_VAR_TX + for (i = 0; i < 1; ++i) + memcpy(x->blk_skip[i], ctx->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); + + if (!is_inter_block(mbmi) || mbmi->skip) + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif // CONFIG_VAR_TX + +#if CONFIG_VAR_TX + { + const TX_SIZE mtx = mbmi->tx_size; + const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1; + const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1; + int idy, idx; + mbmi->inter_tx_size[0][0] = mtx; + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) + mbmi->inter_tx_size[idy][idx] = mtx; + } +#endif // CONFIG_VAR_TX + // Turn motion variation off for supertx + mbmi->motion_mode = SIMPLE_TRANSLATION; + + if (dry_run) return; + + if (!frame_is_intra_only(cm)) { + av1_update_mv_count(td); + +#if CONFIG_GLOBAL_MOTION + if (is_inter_block(mbmi)) { + if (bsize >= BLOCK_8X8) { + // TODO(sarahparker): global motion stats need to be handled per-tile + // to be compatible with tile-based threading. + update_global_motion_used(mbmi->mode, bsize, mbmi, rdc); + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + update_global_motion_used(mi->bmi[j].as_mode, bsize, mbmi, rdc); + } + } + } + } +#endif // CONFIG_GLOBAL_MOTION + + if (cm->interp_filter == SWITCHABLE +#if CONFIG_GLOBAL_MOTION + && !is_nontrans_global_motion(xd) +#endif // CONFIG_GLOBAL_MOTION + ) { +#if CONFIG_DUAL_FILTER + update_filter_type_count(td->counts, xd, mbmi); +#else + const int pred_ctx = av1_get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter]; +#endif + } + + rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; + rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; + rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; + } + + for (h = 0; h < y_mis; ++h) { + MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mv + w; + mv->ref_frame[0] = mi->mbmi.ref_frame[0]; + mv->ref_frame[1] = mi->mbmi.ref_frame[1]; + mv->mv[0].as_int = mi->mbmi.mv[0].as_int; + mv->mv[1].as_int = mi->mbmi.mv[1].as_int; + } + } +} + +static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RUN_TYPE dry_run, PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + int hbs = mi_size_wide[bsize] / 2; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_subsize(bsize, partition); + int i; +#if CONFIG_EXT_PARTITION_TYPES + BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); +#endif + PICK_MODE_CONTEXT *pmc = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) + x->mb_energy = av1_block_energy(cpi, x, bsize); + + switch (partition) { + case PARTITION_NONE: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize, + dry_run); + break; + case PARTITION_VERT: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, + subsize, dry_run); + if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { + set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); + update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row, + mi_col + hbs, subsize, dry_run); + } + pmc = &pc_tree->vertical_supertx; + break; + case PARTITION_HORZ: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col, + subsize, dry_run); + if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs, + mi_col, subsize, dry_run); + } + pmc = &pc_tree->horizontal_supertx; + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col, + subsize, dry_run); + } else { + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run, + pc_tree->split[0]); + set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize, + dry_run, pc_tree->split[1]); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); + update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize, + dry_run, pc_tree->split[2]); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize); + update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, + subsize, dry_run, pc_tree->split[3]); + } + pmc = &pc_tree->split_supertx; + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); + update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col, + bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); + update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row, + mi_col + hbs, bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs, + mi_col, subsize, dry_run); + pmc = &pc_tree->horizontala_supertx; + break; + case PARTITION_HORZ_B: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col, + subsize, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); + update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs, + mi_col, bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); + update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs, + mi_col + hbs, bsize2, dry_run); + pmc = &pc_tree->horizontalb_supertx; + break; + case PARTITION_VERT_A: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); + update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col, + bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); + update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs, + mi_col, bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); + update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row, + mi_col + hbs, subsize, dry_run); + pmc = &pc_tree->verticala_supertx; + break; + case PARTITION_VERT_B: + set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); + update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col, + subsize, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); + update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row, + mi_col + hbs, bsize2, dry_run); + set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); + update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs, + mi_col + hbs, bsize2, dry_run); + pmc = &pc_tree->verticalb_supertx; + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } + + for (i = 0; i < MAX_MB_PLANE; ++i) { + if (pmc != NULL) { + p[i].coeff = pmc->coeff[i]; + p[i].qcoeff = pmc->qcoeff[i]; + pd[i].dqcoeff = pmc->dqcoeff[i]; + p[i].eobs = pmc->eobs[i]; + } else { + // These should never be used + p[i].coeff = NULL; + p[i].qcoeff = NULL; + pd[i].dqcoeff = NULL; + p[i].eobs = NULL; + } + } +} + +static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx, + int best_tx, TX_SIZE supertx_size) { + MACROBLOCK *const x = &td->mb; +#if CONFIG_VAR_TX + int i; + + for (i = 0; i < 1; ++i) + memcpy(ctx->blk_skip[i], x->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); + ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size); +#endif // CONFIG_VAR_TX + ctx->mic.mbmi.tx_size = supertx_size; + ctx->skip = x->skip; + ctx->mic.mbmi.tx_type = best_tx; +} + +static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int best_tx, TX_SIZE supertx_size, + PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + const int hbs = mi_size_wide[bsize] / 2; + PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif +#if CONFIG_EXT_PARTITION_TYPES + int i; +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + switch (partition) { + case PARTITION_NONE: + update_supertx_param(td, &pc_tree->none, best_tx, supertx_size); + break; + case PARTITION_VERT: + update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size); + if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) + update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size); + break; + case PARTITION_HORZ: + update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size); + if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) + update_supertx_param(td, &pc_tree->horizontal[1], best_tx, + supertx_size); + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size); + } else { + update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx, + supertx_size, pc_tree->split[0]); + update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx, + supertx_size, pc_tree->split[1]); + update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx, + supertx_size, pc_tree->split[2]); + update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize, + best_tx, supertx_size, pc_tree->split[3]); + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + for (i = 0; i < 3; i++) + update_supertx_param(td, &pc_tree->horizontala[i], best_tx, + supertx_size); + break; + case PARTITION_HORZ_B: + for (i = 0; i < 3; i++) + update_supertx_param(td, &pc_tree->horizontalb[i], best_tx, + supertx_size); + break; + case PARTITION_VERT_A: + for (i = 0; i < 3; i++) + update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size); + break; + case PARTITION_VERT_B: + for (i = 0; i < 3; i++) + update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size); + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } +} +#endif // CONFIG_SUPERTX + +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +static void set_mode_info_b(const AV1_COMP *const cpi, + const TileInfo *const tile, ThreadData *td, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCK *const x = &td->mb; + set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + update_state(cpi, td, ctx, mi_row, mi_col, bsize, 1); +} + +static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + const int hbs = mi_size_wide[bsize] / 2; + const PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_EXT_PARTITION_TYPES + const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); +#endif +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; + assert(bsize >= BLOCK_8X8); +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + switch (partition) { + case PARTITION_NONE: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, &pc_tree->none); + break; + case PARTITION_VERT: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { + set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, + &pc_tree->vertical[1]); + } + break; + case PARTITION_HORZ: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, + &pc_tree->horizontal[1]); + } + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, + pc_tree->leaf_split[0]); + } else { + set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col, subsize, + pc_tree->split[0]); + set_mode_info_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, subsize, + pc_tree->split[1]); + set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, subsize, + pc_tree->split[2]); + set_mode_info_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, subsize, + pc_tree->split[3]); + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, + &pc_tree->horizontala[0]); + set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, + &pc_tree->horizontala[1]); + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, subsize, + &pc_tree->horizontala[2]); + break; + case PARTITION_HORZ_B: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, + &pc_tree->horizontalb[0]); + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, + &pc_tree->horizontalb[1]); + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, + &pc_tree->horizontalb[2]); + break; + case PARTITION_VERT_A: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2, + &pc_tree->verticala[0]); + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col, bsize2, + &pc_tree->verticala[1]); + set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, subsize, + &pc_tree->verticala[2]); + break; + case PARTITION_VERT_B: + set_mode_info_b(cpi, tile, td, mi_row, mi_col, subsize, + &pc_tree->verticalb[0]); + set_mode_info_b(cpi, tile, td, mi_row, mi_col + hbs, bsize2, + &pc_tree->verticalb[1]); + set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, + &pc_tree->verticalb[2]); + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0 && "Invalid partition type."); break; + } +} +#endif + +void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; + const int widths[3] = { src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int heights[3] = { src->y_crop_height, src->uv_crop_height, + src->uv_crop_height }; + const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + int i; + + // Set current frame pointer. + x->e_mbd.cur_buf = src; + + for (i = 0; i < MAX_MB_PLANE; i++) + setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->mbmi.sb_type, buffers[i], + widths[i], heights[i], strides[i], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, + x->e_mbd.plane[i].subsampling_y); +} + +static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int8_t segment_id) { + int segment_qindex; + const AV1_COMMON *const cm = &cpi->common; + av1_init_plane_quantizers(cpi, x, segment_id); + aom_clear_system_state(); + segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); +} + +static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *totalrate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_TYPE partition, +#endif + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + int i, orig_rdmult; + const int unify_bsize = CONFIG_CB4X4; + + aom_clear_system_state(); + +#if CONFIG_PVQ + x->pvq_speed = 1; + x->pvq_coded = 0; +#endif +#if CONFIG_CFL + // Don't store luma during RDO (we will store the best mode later). + x->cfl_store_y = 0; +#endif + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + mbmi = &xd->mi[0]->mbmi; + mbmi->sb_type = bsize; +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; +#endif +#if CONFIG_SUPERTX + // We set tx_size here as skip blocks would otherwise not set it. + // tx_size needs to be set at this point as supertx_enable in + // write_modes_sb is computed based on this, and if the garbage in memory + // just happens to be the supertx_size, then the packer will code this + // block as a supertx block, even if rdopt did not pick it as such. + mbmi->tx_size = max_txsize_lookup[bsize]; +#endif +#if CONFIG_EXT_PARTITION_TYPES + mbmi->partition = partition; +#endif + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; +#if CONFIG_PVQ + pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; +#endif + p[i].eobs = ctx->eobs[i]; +#if CONFIG_LV_MAP + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; +#endif + } + +#if CONFIG_PALETTE + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; +#endif // CONFIG_PALETTE + + ctx->skippable = 0; + ctx->pred_pixel_ready = 0; + + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip = 0; + +#if CONFIG_CB4X4 + x->skip_chroma_rd = + !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); +#endif + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } +#else + x->source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_HIGHBITDEPTH + + // Save rdmult before it might be changed, so it can be restored later. + orig_rdmult = x->rdmult; + + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = + bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize); + mbmi->segment_id = av1_vaq_segment_id(energy); + // Re-initialise quantiser + av1_init_plane_quantizers(cpi, x, mbmi->segment_id); + } + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); +#if CONFIG_SUPERTX + *totalrate_nocoef = 0; +#endif // CONFIG_SUPERTX + } else { + if (bsize >= BLOCK_8X8 || unify_bsize) { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd); +#if CONFIG_SUPERTX + *totalrate_nocoef = rd_cost->rate; +#endif // CONFIG_SUPERTX + } else { + av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, +#if CONFIG_SUPERTX + totalrate_nocoef, +#endif // CONFIG_SUPERTX + bsize, ctx, best_rd); +#if CONFIG_SUPERTX + assert(*totalrate_nocoef >= 0); +#endif // CONFIG_SUPERTX + } + } else { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set. + rd_cost->rate = INT_MAX; + } else { + av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, + rd_cost, +#if CONFIG_SUPERTX + totalrate_nocoef, +#endif // CONFIG_SUPERTX + bsize, ctx, best_rd); +#if CONFIG_SUPERTX + assert(*totalrate_nocoef >= 0); +#endif // CONFIG_SUPERTX + } + } + } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. + if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) && + (bsize >= BLOCK_16X16) && + (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + x->rdmult = orig_rdmult; + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. + if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + + ctx->rate = rd_cost->rate; + ctx->dist = rd_cost->dist; +} + +#if CONFIG_REF_MV +static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode, + int16_t mode_context) { + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { + ++counts->newmv_mode[mode_ctx][0]; + return; + } else { + ++counts->newmv_mode[mode_ctx][1]; + + if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) { + return; + } + + mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; + if (mode == ZEROMV) { + ++counts->zeromv_mode[mode_ctx][0]; + return; + } else { + ++counts->zeromv_mode[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; + if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; + if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; + + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; + } + } +} +#endif + +static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, + int mi_col +#if CONFIG_SUPERTX + , + int supertx_enabled +#endif + ) { +#if CONFIG_DELTA_Q + MACROBLOCK *x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; +#else + const MACROBLOCK *x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; +#endif + const MODE_INFO *const mi = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int unify_bsize = CONFIG_CB4X4; + +#if CONFIG_DELTA_Q + // delta quant applies to both intra and inter + const int super_block_upper_left = ((mi_row & 7) == 0) && ((mi_col & 7) == 0); + + if (cm->delta_q_present_flag && (bsize != BLOCK_64X64 || !mbmi->skip) && + super_block_upper_left) { + const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res; + const int absdq = abs(dq); + int i; + for (i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + td->counts->delta_q[i][1]++; + } + if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; + xd->prev_qindex = mbmi->current_q_index; +#if CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) { + const int dlf = + (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) / + cm->delta_lf_res; + const int absdlf = abs(dlf); + for (i = 0; i < AOMMIN(absdlf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf[i][1]++; + } + if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++; + xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base; + } +#endif + } +#else + (void)mi_row; + (void)mi_col; +#endif + if (!frame_is_intra_only(cm)) { + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { +#if CONFIG_SUPERTX + if (!supertx_enabled) +#endif + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; +#if CONFIG_EXT_REFS + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; +#endif // CONFIG_EXT_REFS + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { +#if !SUB8X8_COMP_REF + if (mbmi->sb_type >= BLOCK_8X8) + counts->comp_inter[av1_get_reference_mode_context(cm, xd)] + [has_second_ref(mbmi)]++; +#else + counts->comp_inter[av1_get_reference_mode_context(cm, xd)] + [has_second_ref(mbmi)]++; +#endif + } + + if (has_second_ref(mbmi)) { +#if CONFIG_EXT_REFS + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + + counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++; + if (!bit) { + counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1] + [ref0 == LAST_FRAME]++; + } else { + counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2] + [ref0 == GOLDEN_FRAME]++; + } + + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0] + [ref1 == ALTREF_FRAME]++; +#else + counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0] + [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_EXT_REFS + } else { +#if CONFIG_EXT_REFS + const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME); + + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; + if (bit) { + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 != BWDREF_FRAME]++; + } else { + const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + counts + ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; + if (!bit1) { + counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] + [ref0 != LAST_FRAME]++; + } else { + counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] + [ref0 != LAST3_FRAME]++; + } + } +#else + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0] + [ref0 != LAST_FRAME]++; + if (ref0 != LAST_FRAME) { + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 != GOLDEN_FRAME]++; + } +#endif // CONFIG_EXT_REFS + } + +#if CONFIG_EXT_INTER + if (cm->reference_mode != COMPOUND_REFERENCE && +#if CONFIG_SUPERTX + !supertx_enabled && +#endif + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + counts->interintra[bsize_group][1]++; + counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; + if (is_interintra_wedge_used(bsize)) + counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; + } else { + counts->interintra[bsize_group][0]++; + } + } +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + const MOTION_MODE motion_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + mi); +#if CONFIG_SUPERTX + if (!supertx_enabled) +#endif // CONFIG_SUPERTX +#if CONFIG_EXT_INTER + if (mbmi->ref_frame[1] != INTRA_FRAME) +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + { + if (motion_allowed == WARPED_CAUSAL) + counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; + else if (motion_allowed == OBMC_CAUSAL) + counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++; + } +#else + if (motion_allowed > SIMPLE_TRANSLATION) + counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + +#if CONFIG_EXT_INTER + if (cm->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + && mbmi->motion_mode == SIMPLE_TRANSLATION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + ) { + counts->compound_interinter[bsize][mbmi->interinter_compound_type]++; + } +#endif // CONFIG_EXT_INTER + } + } + + if (inter_block && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + int16_t mode_ctx; +#if !CONFIG_REF_MV + mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; +#endif + if (bsize >= BLOCK_8X8 || unify_bsize) { + const PREDICTION_MODE mode = mbmi->mode; +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (has_second_ref(mbmi)) { + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; + } else { +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, bsize, -1); + update_inter_mode_stats(counts, mode, mode_ctx); +#if CONFIG_EXT_INTER + } +#endif // CONFIG_EXT_INTER + +#if CONFIG_EXT_INTER + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { +#else + if (mbmi->mode == NEWMV) { +#endif + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + int idx; + + for (idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; + + if (mbmi->ref_mv_idx == idx) break; + } + } + } + +#if CONFIG_EXT_INTER + if (have_nearmv_in_inter_mode(mbmi->mode)) { +#else + if (mbmi->mode == NEARMV) { +#endif + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + int idx; + + for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; + + if (mbmi->ref_mv_idx == idx - 1) break; + } + } + } +#else +#if CONFIG_EXT_INTER + if (is_inter_compound_mode(mode)) + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; + else +#endif // CONFIG_EXT_INTER + ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; +#endif + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (has_second_ref(mbmi)) { + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + ++counts->inter_compound_mode[mode_ctx] + [INTER_COMPOUND_OFFSET(b_mode)]; + } else { +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, bsize, j); + update_inter_mode_stats(counts, b_mode, mode_ctx); +#if CONFIG_EXT_INTER + } +#endif // CONFIG_EXT_INTER +#else +#if CONFIG_EXT_INTER + if (is_inter_compound_mode(b_mode)) + ++counts->inter_compound_mode[mode_ctx] + [INTER_COMPOUND_OFFSET(b_mode)]; + else +#endif // CONFIG_EXT_INTER + ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; +#endif + } + } + } + } + } +} + +typedef struct { + ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MAX_MIB_SIZE]; + PARTITION_CONTEXT sl[MAX_MIB_SIZE]; +#if CONFIG_VAR_TX + TXFM_CONTEXT *p_ta; + TXFM_CONTEXT *p_tl; + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; +#endif +} RD_SEARCH_MACROBLOCK_CONTEXT; + +static void restore_context(MACROBLOCK *x, + const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, + int mi_col, +#if CONFIG_PVQ + od_rollback_buffer *rdo_buf, +#endif + BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = + block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int num_4x4_blocks_high = + block_size_high[bsize] >> tx_size_high_log2[0]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + for (p = 0; p < MAX_MB_PLANE; p++) { + memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + ctx->a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(xd->left_context[p] + + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y), + ctx->l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_seg_context + mi_col, ctx->sa, + sizeof(*xd->above_seg_context) * mi_width); + memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl, + sizeof(xd->left_seg_context[0]) * mi_height); +#if CONFIG_VAR_TX + xd->above_txfm_context = ctx->p_ta; + xd->left_txfm_context = ctx->p_tl; + memcpy(xd->above_txfm_context, ctx->ta, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(xd->left_txfm_context, ctx->tl, + sizeof(*xd->left_txfm_context) * mi_height); +#endif +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, rdo_buf); +#endif +} + +static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, +#if CONFIG_PVQ + od_rollback_buffer *rdo_buf, +#endif + BLOCK_SIZE bsize) { + const MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = + block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int num_4x4_blocks_high = + block_size_high[bsize] >> tx_size_high_log2[0]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + + // buffer the above/left context information of the block in search. + for (p = 0; p < MAX_MB_PLANE; ++p) { + memcpy(ctx->a + num_4x4_blocks_wide * p, + xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(ctx->l + num_4x4_blocks_high * p, + xd->left_context[p] + + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(ctx->sa, xd->above_seg_context + mi_col, + sizeof(*xd->above_seg_context) * mi_width); + memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK), + sizeof(xd->left_seg_context[0]) * mi_height); +#if CONFIG_VAR_TX + memcpy(ctx->ta, xd->above_txfm_context, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(ctx->tl, xd->left_txfm_context, + sizeof(*xd->left_txfm_context) * mi_height); + ctx->p_ta = xd->above_txfm_context; + ctx->p_tl = xd->left_txfm_context; +#endif +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, rdo_buf); +#endif +} + +static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile, + ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col, + RUN_TYPE dry_run, BLOCK_SIZE bsize, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_TYPE partition, +#endif + PICK_MODE_CONTEXT *ctx, int *rate) { + MACROBLOCK *const x = &td->mb; +#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi; +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC + int check_ncobmc; +#endif +#endif + + set_offsets(cpi, tile, x, mi_row, mi_col, bsize); +#if CONFIG_EXT_PARTITION_TYPES + x->e_mbd.mi[0]->mbmi.partition = partition; +#endif + update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC + mbmi = &xd->mi[0]->mbmi; + const MOTION_MODE motion_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + xd->mi[0]); + check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL; + if (!dry_run && check_ncobmc) { + av1_check_ncobmc_rd(cpi, x, mi_row, mi_col); + av1_setup_dst_planes(x->e_mbd.plane, bsize, + get_frame_new_buffer(&cpi->common), mi_row, mi_col); + } +#endif + encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate); + + if (!dry_run) { +#if CONFIG_EXT_DELTA_Q + mbmi = &xd->mi[0]->mbmi; + if (bsize == BLOCK_64X64 && mbmi->skip == 1 && is_inter_block(mbmi) && + cpi->common.delta_lf_present_flag) { + mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base; + } +#endif +#if CONFIG_SUPERTX + update_stats(&cpi->common, td, mi_row, mi_col, 0); +#else + update_stats(&cpi->common, td, mi_row, mi_col); +#endif + } +} + +static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, + int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, + PC_TREE *pc_tree, int *rate) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + mi_row + hbs < cm->mi_rows, + mi_col + hbs < cm->mi_cols, +#endif + bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_EXT_PARTITION_TYPES + const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); +#endif + +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; + assert(bsize >= BLOCK_8X8); +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++; + +#if CONFIG_SUPERTX + if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE && + partition != PARTITION_NONE && !xd->lossless[0]) { + int supertx_enabled; + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree); + if (supertx_enabled) { + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + int x_idx, y_idx, i; + uint8_t *dst_buf[3]; + int dst_stride[3]; + set_skip_context(xd, mi_row, mi_col); + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run, + pc_tree); + + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col); + for (i = 0; i < MAX_MB_PLANE; i++) { + dst_buf[i] = xd->plane[i].dst.buf; + dst_stride[i] = xd->plane[i].dst.stride; + } + predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run, + bsize, bsize, dst_buf, dst_stride, pc_tree); + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); + + if (!x->skip) { + int this_rate = 0; + av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize); + av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate); + if (rate) *rate += this_rate; + } else { + xd->mi[0]->mbmi.skip = 1; + if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++; + reset_skip_context(xd, bsize); + } + if (!dry_run) { + for (y_idx = 0; y_idx < mi_height; y_idx++) + for (x_idx = 0; x_idx < mi_width; x_idx++) { + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > + x_idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > + y_idx) { + xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip = + xd->mi[0]->mbmi.skip; + } + } + td->counts->supertx[partition_supertx_context_lookup[partition]] + [supertx_size][1]++; + td->counts->supertx_size[supertx_size]++; +#if CONFIG_EXT_TX + if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > + 1 && + !xd->mi[0]->mbmi.skip) { + const int eset = + get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); + if (eset > 0) { + ++td->counts + ->inter_ext_tx[eset][supertx_size][xd->mi[0]->mbmi.tx_type]; + } + } +#else + if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) { + ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type]; + } +#endif // CONFIG_EXT_TX + } +#if CONFIG_EXT_PARTITION_TYPES + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, + partition); +#else + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +#endif +#if CONFIG_VAR_TX + set_txfm_ctxs(supertx_size, mi_width, mi_height, xd->mi[0]->mbmi.skip, + xd); +#endif // CONFIG_VAR_TX + return; + } else { + if (!dry_run) { + td->counts->supertx[partition_supertx_context_lookup[partition]] + [supertx_size][0]++; + } + } + } +#endif // CONFIG_SUPERTX + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + &pc_tree->none, rate); + break; + case PARTITION_VERT: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + &pc_tree->vertical[0], rate); + if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + &pc_tree->vertical[1], rate); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + &pc_tree->horizontal[0], rate); + if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + &pc_tree->horizontal[1], rate); + } + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + pc_tree->leaf_split[0], rate); + } else { + encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, pc_tree->split[3], rate); + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, + &pc_tree->horizontala[0], rate); + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontala[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontala[2], rate); + break; + case PARTITION_HORZ_B: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, + &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontalb[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontalb[2], rate); + break; + case PARTITION_VERT_A: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, + &pc_tree->verticala[0], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[1], rate); + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->verticala[2], rate); + + break; + case PARTITION_VERT_B: + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, + &pc_tree->verticalb[0], rate); + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[2], rate); + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0 && "Invalid partition type."); break; + } + +#if CONFIG_EXT_PARTITION_TYPES + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +#else + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +#endif // CONFIG_EXT_PARTITION_TYPES +} + +// Check to see if the given partition size is allowed for a specified number +// of mi block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + if (rows_left <= 0 || cols_left <= 0) { + return AOMMIN(bsize, BLOCK_8X8); + } else { + for (; bsize > 0; bsize -= 3) { + *bh = mi_size_high[bsize]; + *bw = mi_size_wide[bsize]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return bsize; +} + +static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi, + int bh_in, int bw_in, + int mi_rows_remaining, + int mi_cols_remaining, BLOCK_SIZE bsize, + MODE_INFO **mib) { + int bh = bh_in; + int r, c; + for (r = 0; r < cm->mib_size; r += bh) { + int bw = bw_in; + for (c = 0; c < cm->mib_size; c += bw) { + const int index = r * cm->mi_stride + c; + mib[index] = mi + index; + mib[index]->mbmi.sb_type = find_partition_size( + bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given superblock +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. +static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mib, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + int block_row, block_col; + MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + + assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); + + // Apply the requested partition size to the SB if it is all "in image" + if ((mi_cols_remaining >= cm->mib_size) && + (mi_rows_remaining >= cm->mib_size)) { + for (block_row = 0; block_row < cm->mib_size; block_row += bh) { + for (block_col = 0; block_col < cm->mib_size; block_col += bw) { + int index = block_row * cm->mi_stride + block_col; + mib[index] = mi_upper_left + index; + mib[index]->mbmi.sb_type = bsize; + } + } + } else { + // Else this is a partial SB. + set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining, + mi_cols_remaining, bsize, mib); + } +} + +static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MODE_INFO **mib, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, +#if CONFIG_SUPERTX + int *rate_nocoef, +#endif + int do_recon, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + int i; + const int pl = (bsize >= BLOCK_8X8) + ? partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + mi_row + hbs < cm->mi_rows, + mi_col + hbs < cm->mi_cols, +#endif + bsize) + : 0; + const PARTITION_TYPE partition = + (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + const BLOCK_SIZE subsize = get_subsize(bsize, partition); + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS last_part_rdc, none_rdc, chosen_rdc; + BLOCK_SIZE sub_subsize = BLOCK_4X4; + int splits_below = 0; + BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type; + int do_partition_search = 1; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; + const int unify_bsize = CONFIG_CB4X4; +#if CONFIG_SUPERTX + int last_part_rate_nocoef = INT_MAX; + int none_rate_nocoef = INT_MAX; + int chosen_rate_nocoef = INT_MAX; +#endif +#if CONFIG_PVQ + od_rollback_buffer pre_rdo_buf; +#endif + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + assert(num_4x4_blocks_wide_lookup[bsize] == + num_4x4_blocks_high_lookup[bsize]); + + av1_invalid_rd_stats(&last_part_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_invalid_rd_stats(&chosen_rdc); + + pc_tree->partitioning = partition; + +#if CONFIG_VAR_TX + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); +#endif +#if !CONFIG_PVQ + save_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + x->mb_energy = av1_block_energy(cpi, x, bsize); + } + + if (do_partition_search && + cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.adjust_partitioning_from_last_frame) { + // Check if any of the sub blocks are further split. + if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { + sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + splits_below = 1; + for (i = 0; i < 4; i++) { + int jj = i >> 1, ii = i & 0x01; + MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs]; + if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) { + splits_below = 0; + } + } + } + + // If partition is not none try none unless each of the 4 splits are split + // even further.. + if (partition != PARTITION_NONE && !splits_below && + mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) { + pc_tree->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, +#if CONFIG_SUPERTX + &none_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_NONE, +#endif + bsize, ctx_none, INT64_MAX); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist); +#if CONFIG_SUPERTX + none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE]; +#endif + } + +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + mib[0]->mbmi.sb_type = bs_type; + pc_tree->partitioning = partition; + } + } + + switch (partition) { + case PARTITION_NONE: + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, +#if CONFIG_SUPERTX + &last_part_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_NONE, +#endif + bsize, ctx_none, INT64_MAX); + break; + case PARTITION_HORZ: + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, +#if CONFIG_SUPERTX + &last_part_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_HORZ, +#endif + subsize, &pc_tree->horizontal[0], INT64_MAX); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < cm->mi_rows) { + RD_STATS tmp_rdc; +#if CONFIG_SUPERTX + int rt_nocoef = 0; +#endif + PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx_h, NULL); + rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, +#if CONFIG_SUPERTX + &rt_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_HORZ, +#endif + subsize, &pc_tree->horizontal[1], INT64_MAX); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); +#if CONFIG_SUPERTX + last_part_rate_nocoef = INT_MAX; +#endif + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; +#if CONFIG_SUPERTX + last_part_rate_nocoef += rt_nocoef; +#endif + } + break; + case PARTITION_VERT: + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, +#if CONFIG_SUPERTX + &last_part_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_VERT, +#endif + subsize, &pc_tree->vertical[0], INT64_MAX); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < cm->mi_cols) { + RD_STATS tmp_rdc; +#if CONFIG_SUPERTX + int rt_nocoef = 0; +#endif + PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx_v, NULL); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, +#if CONFIG_SUPERTX + &rt_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_VERT, +#endif + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], + INT64_MAX); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); +#if CONFIG_SUPERTX + last_part_rate_nocoef = INT_MAX; +#endif + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; +#if CONFIG_SUPERTX + last_part_rate_nocoef += rt_nocoef; +#endif + } + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, +#if CONFIG_SUPERTX + &last_part_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_SPLIT, +#endif + subsize, pc_tree->leaf_split[0], INT64_MAX); + break; + } + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; +#if CONFIG_SUPERTX + last_part_rate_nocoef = 0; +#endif + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; +#if CONFIG_SUPERTX + int rt_nocoef; +#endif + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + rd_use_partition(cpi, td, tile_data, + mib + jj * hbs * cm->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, +#if CONFIG_SUPERTX + &rt_nocoef, +#endif + i != 3, pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); +#if CONFIG_SUPERTX + last_part_rate_nocoef = INT_MAX; +#endif + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; +#if CONFIG_SUPERTX + last_part_rate_nocoef += rt_nocoef; +#endif + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partiton types"); +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += cpi->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist); +#if CONFIG_SUPERTX + last_part_rate_nocoef += cpi->partition_cost[pl][partition]; +#endif + } + + if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame && + cpi->sf.partition_search_type == SEARCH_PARTITION && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) && + (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) { + BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; +#if CONFIG_SUPERTX + chosen_rate_nocoef = 0; +#endif +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + RD_STATS tmp_rdc; +#if CONFIG_SUPERTX + int rt_nocoef = 0; +#endif +#if CONFIG_PVQ + od_rollback_buffer buf; +#endif + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + +#if !CONFIG_PVQ + save_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); +#endif + pc_tree->split[i]->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &tmp_rdc, +#if CONFIG_SUPERTX + &rt_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_SPLIT, +#endif + split_subsize, &pc_tree->split[i]->none, INT64_MAX); + +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize); +#endif + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&chosen_rdc); +#if CONFIG_SUPERTX + chosen_rate_nocoef = INT_MAX; +#endif + break; + } + + chosen_rdc.rate += tmp_rdc.rate; + chosen_rdc.dist += tmp_rdc.dist; +#if CONFIG_SUPERTX + chosen_rate_nocoef += rt_nocoef; +#endif + + if (i != 3) + encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, + OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); + + chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; +#if CONFIG_SUPERTX + chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT]; +#endif + } + if (chosen_rdc.rate < INT_MAX) { + chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + chosen_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist); +#if CONFIG_SUPERTX + chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE]; +#endif + } + } + + // If last_part is better set the partitioning to that. + if (last_part_rdc.rdcost < chosen_rdc.rdcost) { + mib[0]->mbmi.sb_type = bsize; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; + chosen_rdc = last_part_rdc; +#if CONFIG_SUPERTX + chosen_rate_nocoef = last_part_rate_nocoef; +#endif + } + // If none was better set the partitioning to that. + if (none_rdc.rdcost < chosen_rdc.rdcost) { + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + chosen_rdc = none_rdc; +#if CONFIG_SUPERTX + chosen_rate_nocoef = none_rate_nocoef; +#endif + } + +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. + if (bsize == cm->sb_size) + assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); + + if (do_recon) { + if (bsize == cm->sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + *rate = chosen_rdc.rate; + *dist = chosen_rdc.dist; +#if CONFIG_SUPERTX + *rate_nocoef = chosen_rate_nocoef; +#endif +} + +/* clang-format off */ +static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { +#if CONFIG_CB4X4 + BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 +#endif + BLOCK_4X4, // 4x4 + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8 + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16 + BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION +}; + +static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { +#if CONFIG_CB4X4 + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2 +#endif + BLOCK_8X8, // 4x4 + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8 + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16 + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 + BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION +}; + +// Next square block size less or equal than current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { +#if CONFIG_CB4X4 + BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 +#endif + BLOCK_4X4, // 4x4 + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8 + BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16 + BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 + BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 +#if CONFIG_EXT_PARTITION + BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128 +#endif // CONFIG_EXT_PARTITION +}; +/* clang-format on */ + +// Look at all the mode_info entries for blocks that are part of this +// partition and find the min and max values for sb_type. +// At the moment this is designed to work on a superblock but could be +// adjusted to use a size parameter. +// +// The min and max are assumed to have been initialized prior to calling this +// function so repeat calls can accumulate a min and max of more than one +// superblock. +static void get_sb_partition_size_range(const AV1_COMMON *const cm, + MACROBLOCKD *xd, MODE_INFO **mib, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { + int i, j; + int index = 0; + + // Check the sb_type for each block that belongs to this region. + for (i = 0; i < cm->mib_size; ++i) { + for (j = 0; j < cm->mib_size; ++j) { + MODE_INFO *mi = mib[index + j]; + BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4; + *min_block_size = AOMMIN(*min_block_size, sb_type); + *max_block_size = AOMMAX(*max_block_size, sb_type); + } + index += xd->mi_stride; + } +} + +// Look at neighboring blocks and set a min and max partition size based on +// what they chose. +static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, + MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { + AV1_COMMON *const cm = &cpi->common; + MODE_INFO **mi = xd->mi; + const int left_in_image = xd->left_available && mi[-1]; + const int above_in_image = xd->up_available && mi[-xd->mi_stride]; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + int bh, bw; + BLOCK_SIZE min_size = BLOCK_4X4; + BLOCK_SIZE max_size = BLOCK_LARGEST; + + // Trap case where we do not have a prediction. + if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) { + // Default "min to max" and "max to min" + min_size = BLOCK_LARGEST; + max_size = BLOCK_4X4; + + // NOTE: each call to get_sb_partition_size_range() uses the previous + // passed in values for min and max as a starting point. + // Find the min and max partition used in previous frame at this location + if (cm->frame_type != KEY_FRAME) { + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; + get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size); + } + // Find the min and max partition sizes used in the left superblock + if (left_in_image) { + MODE_INFO **left_sb_mi = &mi[-cm->mib_size]; + get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size); + } + // Find the min and max partition sizes used in the above suprblock. + if (above_in_image) { + MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size]; + get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size); + } + + // Adjust observed min and max for "relaxed" auto partition case. + if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + } + + // Check border cases where max and min from neighbors may not be legal. + max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining, + &bh, &bw); + min_size = AOMMIN(min_size, max_size); + + // Test for blocks at the edge of the active image. + // This may be the actual edge of the image or where there are formatting + // bars. + if (av1_active_edge_sb(cpi, mi_row, mi_col)) { + min_size = BLOCK_4X4; + } else { + min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size); + } + + // When use_square_partition_only is true, make sure at least one square + // partition is allowed by selecting the next smaller square size as + // *min_block_size. + if (cpi->sf.use_square_partition_only) { + min_size = AOMMIN(min_size, next_square_size[max_size]); + } + + *min_block_size = AOMMIN(min_size, cm->sb_size); + *max_block_size = AOMMIN(max_size, cm->sb_size); +} + +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *const min_bs, + BLOCK_SIZE *const max_bs) { + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + int idx, idy; + + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; + BLOCK_SIZE min_size = BLOCK_64X64; // default values + BLOCK_SIZE max_size = BLOCK_4X4; + + if (prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + const MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; + const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + min_size = AOMMIN(min_size, bs); + max_size = AOMMAX(max_size, bs); + } + } + } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + const MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; + const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + min_size = AOMMIN(min_size, bs); + max_size = AOMMAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + const MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; + const BLOCK_SIZE bs = mi ? mi->mbmi.sb_type : bsize; + min_size = AOMMIN(min_size, bs); + max_size = AOMMAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = AOMMIN(min_size, cm->sb_size); + *max_bs = AOMMIN(max_size, cm->sb_size); +} + +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + +#if CONFIG_FP_MB_STATS +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { + 0, + 10, + 10, + 30, + 40, + 40, + 60, + 80, + 80, + 90, + 100, + 100, + 120, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 130, + 130, + 150 +#endif // CONFIG_EXT_PARTITION +}; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = { + 0, + 3, + 3, + 7, + 15, + 15, + 30, + 40, + 40, + 60, + 80, + 80, + 120, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 160, + 160, + 240 +#endif // CONFIG_EXT_PARTITION +}; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 4, + 4, + 6, +#if CONFIG_EXT_PARTITION + // TODO(debargha): What are the correct numbers here? + 8, + 8, + 10 +#endif // CONFIG_EXT_PARTITION +}; + +typedef enum { + MV_ZERO = 0, + MV_LEFT = 1, + MV_UP = 2, + MV_RIGHT = 3, + MV_DOWN = 4, + MV_INVALID +} MOTION_DIRECTION; + +static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { + if (fp_byte & FPMB_MOTION_ZERO_MASK) { + return MV_ZERO; + } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { + return MV_LEFT; + } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { + return MV_RIGHT; + } else if (fp_byte & FPMB_MOTION_UP_MASK) { + return MV_UP; + } else { + return MV_DOWN; + } +} + +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + +#if CONFIG_EXT_PARTITION_TYPES +static void rd_test_partition3( + const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, + PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, PARTITION_TYPE partition, +#if CONFIG_SUPERTX + int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, +#endif + int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1, + BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS this_rdc, sum_rdc; +#if CONFIG_SUPERTX + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + int this_rate_nocoef, sum_rate_nocoef; + int abort_flag; + const int supertx_allowed = !frame_is_intra_only(cm) && + bsize <= MAX_SUPERTX_BLOCK_SIZE && + !xd->lossless[0]; +#endif + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + + rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc, +#if CONFIG_SUPERTX + &sum_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + subsize0, &ctxs[0], best_rdc->rdcost); +#if CONFIG_SUPERTX + abort_flag = sum_rdc.rdcost >= best_rd; +#endif + +#if CONFIG_SUPERTX + if (sum_rdc.rdcost < INT64_MAX) { +#else + if (sum_rdc.rdcost < best_rdc->rdcost) { +#endif + PICK_MODE_CONTEXT *ctx_0 = &ctxs[0]; + update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0, + ctx_0, NULL); + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0); + +#if CONFIG_SUPERTX + rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc, + &this_rate_nocoef, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost); +#else + rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost); +#endif // CONFIG_SUPERTX + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; +#if CONFIG_SUPERTX + sum_rate_nocoef += this_rate_nocoef; +#endif + } + +#if CONFIG_SUPERTX + if (sum_rdc.rdcost < INT64_MAX) { +#else + if (sum_rdc.rdcost < best_rdc->rdcost) { +#endif + PICK_MODE_CONTEXT *ctx_1 = &ctxs[1]; + update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1, + ctx_1, NULL); + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1); + +#if CONFIG_SUPERTX + rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc, + &this_rate_nocoef, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost); +#else + rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc, +#if CONFIG_EXT_PARTITION_TYPES + partition, +#endif + subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost); +#endif // CONFIG_SUPERTX + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; +#if CONFIG_SUPERTX + sum_rate_nocoef += this_rate_nocoef; +#endif + } + +#if CONFIG_SUPERTX + if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) { + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + const PARTITION_TYPE best_partition = pc_tree->partitioning; + pc_tree->partitioning = partition; + sum_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] + [supertx_size], + 0); + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + + if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { + TX_TYPE best_tx = DCT_DCT; + RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 }; + + restore_context(x, x_ctx, mi_row, mi_col, bsize); + + rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, + &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); + + tmp_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] + [supertx_size], + 1); + tmp_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + if (tmp_rdc.rdcost < sum_rdc.rdcost) { + sum_rdc = tmp_rdc; + update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, + supertx_size, pc_tree); + } + } + + pc_tree->partitioning = best_partition; + } +#endif // CONFIG_SUPERTX + + if (sum_rdc.rdcost < best_rdc->rdcost) { + int pl = partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + has_rows, has_cols, +#endif + bsize); + sum_rdc.rate += cpi->partition_cost[pl][partition]; + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); +#if CONFIG_SUPERTX + sum_rate_nocoef += cpi->partition_cost[pl][partition]; +#endif + if (sum_rdc.rdcost < best_rdc->rdcost) { +#if CONFIG_SUPERTX + *best_rate_nocoef = sum_rate_nocoef; + assert(*best_rate_nocoef >= 0); +#endif + *best_rdc = sum_rdc; + pc_tree->partitioning = partition; + } + } + } + } +} +#endif // CONFIG_EXT_PARTITION_TYPES + +// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// unlikely to be selected depending on previous rate-distortion optimization +// results, for encoding speed-up. +static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *rate_nocoef, +#endif + int64_t best_rd, PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = mi_size_wide[bsize] / 2; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TOKENEXTRA *const tp_orig = *tp; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; +#if CONFIG_UNPOISON_PARTITION_CTX + const int hbs = mi_size_wide[bsize] / 2; + const int has_rows = mi_row + hbs < cm->mi_rows; + const int has_cols = mi_col + hbs < cm->mi_cols; +#else + int tmp_partition_cost[PARTITION_TYPES]; +#endif + BLOCK_SIZE subsize; + RD_STATS this_rdc, sum_rdc, best_rdc; + const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + int do_square_split = bsize_at_least_8x8; +#if CONFIG_CB4X4 + const int unify_bsize = 1; + const int pl = bsize_at_least_8x8 + ? partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + has_rows, has_cols, +#endif + bsize) + : 0; +#else + const int unify_bsize = 0; + const int pl = partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + has_rows, has_cols, +#endif + bsize); +#endif // CONFIG_CB4X4 + const int *partition_cost = cpi->partition_cost[pl]; +#if CONFIG_SUPERTX + int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX; + int abort_flag; + const int supertx_allowed = !frame_is_intra_only(cm) && + bsize <= MAX_SUPERTX_BLOCK_SIZE && + !xd->lossless[0]; +#endif // CONFIG_SUPERTX + + int do_rectangular_split = 1; +#if CONFIG_EXT_PARTITION_TYPES + BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); +#endif + + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); + const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + BLOCK_SIZE min_size = x->min_partition_size; + BLOCK_SIZE max_size = x->max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; + int none_complexity = 0; +#endif + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = + !force_vert_split && yss <= xss && bsize_at_least_8x8; + int partition_vert_allowed = + !force_horz_split && xss <= yss && bsize_at_least_8x8; + +#if CONFIG_PVQ + od_rollback_buffer pre_rdo_buf; +#endif + + (void)*tp_orig; + +#if !CONFIG_UNPOISON_PARTITION_CTX + if (force_horz_split || force_vert_split) { + tmp_partition_cost[PARTITION_NONE] = INT_MAX; + + if (!force_vert_split) { // force_horz_split only + tmp_partition_cost[PARTITION_VERT] = INT_MAX; + tmp_partition_cost[PARTITION_HORZ] = + av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0); + tmp_partition_cost[PARTITION_SPLIT] = + av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1); + } else if (!force_horz_split) { // force_vert_split only + tmp_partition_cost[PARTITION_HORZ] = INT_MAX; + tmp_partition_cost[PARTITION_VERT] = + av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0); + tmp_partition_cost[PARTITION_SPLIT] = + av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1); + } else { // force_ horz_split && force_vert_split horz_split + tmp_partition_cost[PARTITION_HORZ] = INT_MAX; + tmp_partition_cost[PARTITION_VERT] = INT_MAX; + tmp_partition_cost[PARTITION_SPLIT] = 0; + } + + partition_cost = tmp_partition_cost; + } +#endif + +#if CONFIG_VAR_TX +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from encoding the previous block. Setting it to magic number + // when debugging. + memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0])); +#endif // NDEBUG +#endif // CONFIG_VAR_TX + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + av1_init_rd_stats(&this_rdc); + av1_init_rd_stats(&sum_rdc); + av1_invalid_rd_stats(&best_rdc); + best_rdc.rdcost = best_rd; + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) + x->mb_energy = av1_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + const int cb_partition_search_ctrl = + ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); + } + + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. + if (cpi->sf.auto_min_max_partition_size) { + const int no_partition_allowed = (bsize <= max_size && bsize >= min_size); + // Note: Further partitioning is NOT allowed when bsize == min_size already. + const int partition_allowed = (bsize <= max_size && bsize > min_size); + partition_none_allowed &= no_partition_allowed; + partition_horz_allowed &= partition_allowed || force_horz_split; + partition_vert_allowed &= partition_allowed || force_vert_split; + do_square_split &= bsize > min_size; + } + if (cpi->sf.use_square_partition_only) { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } + +#if CONFIG_VAR_TX + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); +#endif +#if !CONFIG_PVQ + save_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row, + mi_col, bsize); + } +#endif + +#if CONFIG_FP_MB_STATS + // Decide whether we shall split directly and skip searching NONE by using + // the first pass block statistics + if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split && + partition_none_allowed && src_diff_var > 4 && + cm->base_qindex < qindex_split_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + // compute a complexity measure, basically measure inconsistency of motion + // vectors obtained from the first pass in the current block + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + + MOTION_DIRECTION this_mv; + MOTION_DIRECTION right_mv; + MOTION_DIRECTION bottom_mv; + + this_mv = + get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); + + // to its right + if (c != mb_col_end - 1) { + right_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + 1]); + none_complexity += get_motion_inconsistency(this_mv, right_mv); + } + + // to its bottom + if (r != mb_row_end - 1) { + bottom_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); + none_complexity += get_motion_inconsistency(this_mv, bottom_mv); + } + + // do not count its left and top neighbors to avoid double counting + } + } + + if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { + partition_none_allowed = 0; + } + } +#endif + + // PARTITION_NONE + if (partition_none_allowed) { + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, +#if CONFIG_SUPERTX + &this_rate_nocoef, +#endif +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_NONE, +#endif + bsize, ctx_none, best_rdc.rdcost); + if (this_rdc.rate != INT_MAX) { + if (bsize_at_least_8x8) { + this_rdc.rate += partition_cost[PARTITION_NONE]; + this_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); +#if CONFIG_SUPERTX + this_rate_nocoef += partition_cost[PARTITION_NONE]; +#endif + } + + if (this_rdc.rdcost < best_rdc.rdcost) { + // Adjust dist breakout threshold according to the partition size. + const int64_t dist_breakout_thr = + cpi->sf.partition_search_breakout_dist_thr >> + ((2 * (MAX_SB_SIZE_LOG2 - 2)) - + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + const int rate_breakout_thr = + cpi->sf.partition_search_breakout_rate_thr * + num_pels_log2_lookup[bsize]; + + best_rdc = this_rdc; +#if CONFIG_SUPERTX + best_rate_nocoef = this_rate_nocoef; + assert(best_rate_nocoef >= 0); +#endif + if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; + + // If all y, u, v transform blocks in this partition are skippable, and + // the dist & rate are within the thresholds, the partition search is + // terminated for current branch of the partition search tree. + // The dist & rate thresholds are set to 0 at speed 0 to disable the + // early termination at that speed. + if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] && + (ctx_none->skippable && best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_square_split = 0; + do_rectangular_split = 0; + } + +#if CONFIG_FP_MB_STATS + // Check if every 16x16 first pass block statistics has zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame. If the variance is small enough, + // stop further splitting in RD optimization + if (cpi->use_fp_mb_stats && do_square_split && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if (!(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_MOTION_ZERO_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_SMALL_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + if (src_diff_var == UINT_MAX) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance( + cpi, &x->plane[0].src, mi_row, mi_col, bsize); + } + if (src_diff_var < 8) { + do_square_split = 0; + do_rectangular_split = 0; + } + } + } +#endif + } + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } + + // store estimated motion vector + if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); + + // PARTITION_SPLIT + // TODO(jingning): use the motion vectors given by the above search as + // the starting point of motion search in the following partition type check. + if (do_square_split) { + int reached_last_index = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + if (bsize == BLOCK_8X8 && !unify_bsize) { +#if CONFIG_DUAL_FILTER + if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) + pc_tree->leaf_split[0]->pred_interp_filter = + ctx_none->mic.mbmi.interp_filter[0]; +#else + if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) + pc_tree->leaf_split[0]->pred_interp_filter = + ctx_none->mic.mbmi.interp_filter; +#endif +#if CONFIG_SUPERTX + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, + &sum_rate_nocoef, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_SPLIT, +#endif + subsize, pc_tree->leaf_split[0], INT64_MAX); +#else + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_SPLIT, +#endif + subsize, pc_tree->leaf_split[0], best_rdc.rdcost); +#endif // CONFIG_SUPERTX + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif + } +#if CONFIG_SUPERTX + if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) { + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + const PARTITION_TYPE best_partition = pc_tree->partitioning; + + pc_tree->partitioning = PARTITION_SPLIT; + + sum_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup + [PARTITION_SPLIT]][supertx_size], + 0); + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + + if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) { + TX_TYPE best_tx = DCT_DCT; + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + tmp_rdc.rate = sum_rate_nocoef; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + + rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, + &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); + + tmp_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup + [PARTITION_SPLIT]][supertx_size], + 1); + tmp_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + if (tmp_rdc.rdcost < sum_rdc.rdcost) { + sum_rdc = tmp_rdc; + update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, + supertx_size, pc_tree); + } + } + + pc_tree->partitioning = best_partition; + } +#endif // CONFIG_SUPERTX + reached_last_index = 1; + } else { + int idx; +#if CONFIG_SUPERTX + for (idx = 0; idx < 4 && sum_rdc.rdcost < INT64_MAX; ++idx) { +#else + for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { +#endif // CONFIG_SUPERTX + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + + pc_tree->split[idx]->index = idx; +#if CONFIG_SUPERTX + rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, &this_rate_nocoef, + INT64_MAX - sum_rdc.rdcost, pc_tree->split[idx]); +#else + rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]); +#endif // CONFIG_SUPERTX + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; +#if CONFIG_SUPERTX + sum_rate_nocoef += this_rate_nocoef; +#endif // CONFIG_SUPERTX + } + } + reached_last_index = (idx == 4); +#if CONFIG_SUPERTX + if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) { + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + const PARTITION_TYPE best_partition = pc_tree->partitioning; + + pc_tree->partitioning = PARTITION_SPLIT; + + sum_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup + [PARTITION_SPLIT]][supertx_size], + 0); + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + + if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { + TX_TYPE best_tx = DCT_DCT; + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + tmp_rdc.rate = sum_rate_nocoef; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + + rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, + &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree); + + tmp_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup + [PARTITION_SPLIT]][supertx_size], + 1); + tmp_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + if (tmp_rdc.rdcost < sum_rdc.rdcost) { + sum_rdc = tmp_rdc; + update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, + supertx_size, pc_tree); + } + } + + pc_tree->partitioning = best_partition; + } +#endif // CONFIG_SUPERTX + } + + if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); +#if CONFIG_SUPERTX + sum_rate_nocoef += partition_cost[PARTITION_SPLIT]; +#endif // CONFIG_SUPERTX + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; +#if CONFIG_SUPERTX + best_rate_nocoef = sum_rate_nocoef; + assert(best_rate_nocoef >= 0); +#endif // CONFIG_SUPERTX + pc_tree->partitioning = PARTITION_SPLIT; + } + } else if (cpi->sf.less_rectangular_check) { + // skip rectangular partition test when larger block size + // gives better rd cost + do_rectangular_split &= !partition_none_allowed; + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } // if (do_split) + + // PARTITION_HORZ + if (partition_horz_allowed && + (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { + subsize = get_subsize(bsize, PARTITION_HORZ); + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); +#if CONFIG_DUAL_FILTER + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[0].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter[0]; +#else + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[0].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter; +#endif + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, +#if CONFIG_SUPERTX + &sum_rate_nocoef, +#endif // CONFIG_SUPERTX +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_HORZ, +#endif + subsize, &pc_tree->horizontal[0], best_rdc.rdcost); + +#if CONFIG_SUPERTX + abort_flag = + (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || + (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); + if (sum_rdc.rdcost < INT64_MAX && +#else + if (sum_rdc.rdcost < best_rdc.rdcost && +#endif // CONFIG_SUPERTX + !force_horz_split && (bsize > BLOCK_8X8 || unify_bsize)) { + PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; + update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx_h, NULL); + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h); + +#if CONFIG_DUAL_FILTER + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[1].pred_interp_filter = + ctx_h->mic.mbmi.interp_filter[0]; +#else + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[1].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter; +#endif +#if CONFIG_SUPERTX + rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, + &this_rate_nocoef, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_HORZ, +#endif + subsize, &pc_tree->horizontal[1], INT64_MAX); +#else + rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_HORZ, +#endif + subsize, &pc_tree->horizontal[1], + best_rdc.rdcost - sum_rdc.rdcost); +#endif // CONFIG_SUPERTX + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; +#if CONFIG_SUPERTX + sum_rate_nocoef += this_rate_nocoef; +#endif // CONFIG_SUPERTX + } + } + +#if CONFIG_SUPERTX + if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + const PARTITION_TYPE best_partition = pc_tree->partitioning; + + pc_tree->partitioning = PARTITION_HORZ; + + sum_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] + [supertx_size], + 0); + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + + if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { + TX_TYPE best_tx = DCT_DCT; + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + tmp_rdc.rate = sum_rate_nocoef; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + + rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, + &tmp_rdc.dist, &best_tx, pc_tree); + + tmp_rdc.rate += av1_cost_bit( + cm->fc + ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] + [supertx_size], + 1); + tmp_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + if (tmp_rdc.rdcost < sum_rdc.rdcost) { + sum_rdc = tmp_rdc; + update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, + supertx_size, pc_tree); + } + } + + pc_tree->partitioning = best_partition; + } +#endif // CONFIG_SUPERTX + + if (sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_HORZ]; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); +#if CONFIG_SUPERTX + sum_rate_nocoef += partition_cost[PARTITION_HORZ]; +#endif // CONFIG_SUPERTX + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; +#if CONFIG_SUPERTX + best_rate_nocoef = sum_rate_nocoef; + assert(best_rate_nocoef >= 0); +#endif // CONFIG_SUPERTX + pc_tree->partitioning = PARTITION_HORZ; + } + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } + + // PARTITION_VERT + if (partition_vert_allowed && + (do_rectangular_split || av1_active_v_edge(cpi, mi_col, mi_step))) { + subsize = get_subsize(bsize, PARTITION_VERT); + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + +#if CONFIG_DUAL_FILTER + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[0].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter[0]; +#else + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[0].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter; +#endif + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, +#if CONFIG_SUPERTX + &sum_rate_nocoef, +#endif // CONFIG_SUPERTX +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_VERT, +#endif + subsize, &pc_tree->vertical[0], best_rdc.rdcost); +#if CONFIG_SUPERTX + abort_flag = + (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) || + (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8); + if (sum_rdc.rdcost < INT64_MAX && +#else + if (sum_rdc.rdcost < best_rdc.rdcost && +#endif // CONFIG_SUPERTX + !force_vert_split && (bsize > BLOCK_8X8 || unify_bsize)) { + update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + &pc_tree->vertical[0], NULL); + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + +#if CONFIG_DUAL_FILTER + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[1].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter[0]; +#else + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[1].pred_interp_filter = + ctx_none->mic.mbmi.interp_filter; +#endif +#if CONFIG_SUPERTX + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, + &this_rate_nocoef, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_VERT, +#endif + subsize, &pc_tree->vertical[1], + INT64_MAX - sum_rdc.rdcost); +#else + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_VERT, +#endif + subsize, &pc_tree->vertical[1], + best_rdc.rdcost - sum_rdc.rdcost); +#endif // CONFIG_SUPERTX + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; +#if CONFIG_SUPERTX + sum_rate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; +#if CONFIG_SUPERTX + sum_rate_nocoef += this_rate_nocoef; +#endif // CONFIG_SUPERTX + } + } +#if CONFIG_SUPERTX + if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { + TX_SIZE supertx_size = max_txsize_lookup[bsize]; + const PARTITION_TYPE best_partition = pc_tree->partitioning; + + pc_tree->partitioning = PARTITION_VERT; + + sum_rdc.rate += av1_cost_bit( + cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] + [supertx_size], + 0); + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + + if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { + TX_TYPE best_tx = DCT_DCT; + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + tmp_rdc.rate = sum_rate_nocoef; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + + rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate, + &tmp_rdc.dist, &best_tx, pc_tree); + + tmp_rdc.rate += av1_cost_bit( + cm->fc + ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] + [supertx_size], + 1); + tmp_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + if (tmp_rdc.rdcost < sum_rdc.rdcost) { + sum_rdc = tmp_rdc; + update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, + supertx_size, pc_tree); + } + } + + pc_tree->partitioning = best_partition; + } +#endif // CONFIG_SUPERTX + + if (sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_VERT]; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); +#if CONFIG_SUPERTX + sum_rate_nocoef += partition_cost[PARTITION_VERT]; +#endif // CONFIG_SUPERTX + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; +#if CONFIG_SUPERTX + best_rate_nocoef = sum_rate_nocoef; + assert(best_rate_nocoef >= 0); +#endif // CONFIG_SUPERTX + pc_tree->partitioning = PARTITION_VERT; + } + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } + +#if CONFIG_EXT_PARTITION_TYPES + // PARTITION_HORZ_A + if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 && + partition_none_allowed) { + subsize = get_subsize(bsize, PARTITION_HORZ_A); + rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, + pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, + PARTITION_HORZ_A, +#if CONFIG_SUPERTX + best_rd, &best_rate_nocoef, &x_ctx, +#endif + mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2, + mi_row + mi_step, mi_col, subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + } + // PARTITION_HORZ_B + if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 && + partition_none_allowed) { + subsize = get_subsize(bsize, PARTITION_HORZ_B); + rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, + pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, + PARTITION_HORZ_B, +#if CONFIG_SUPERTX + best_rd, &best_rate_nocoef, &x_ctx, +#endif + mi_row, mi_col, subsize, mi_row + mi_step, mi_col, + bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + } + // PARTITION_VERT_A + if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 && + partition_none_allowed) { + subsize = get_subsize(bsize, PARTITION_VERT_A); + rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, + pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, + PARTITION_VERT_A, +#if CONFIG_SUPERTX + best_rd, &best_rate_nocoef, &x_ctx, +#endif + mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2, + mi_row, mi_col + mi_step, subsize); + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + } + // PARTITION_VERT_B + if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 && + partition_none_allowed) { + subsize = get_subsize(bsize, PARTITION_VERT_B); + rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, + pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, + PARTITION_VERT_B, +#if CONFIG_SUPERTX + best_rd, &best_rate_nocoef, &x_ctx, +#endif + mi_row, mi_col, subsize, mi_row, mi_col + mi_step, + bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); + restore_context(x, &x_ctx, mi_row, mi_col, bsize); + } +#endif // CONFIG_EXT_PARTITION_TYPES + + // TODO(jbb): This code added so that we avoid static analysis + // warning related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used... + (void)best_rd; + *rd_cost = best_rdc; +#if CONFIG_SUPERTX + *rate_nocoef = best_rate_nocoef; +#endif // CONFIG_SUPERTX + +#if CONFIG_CFL + // Store the luma for the best mode + x->cfl_store_y = 1; +#endif + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && + pc_tree->index != 3) { + if (bsize == cm->sb_size) { +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC + set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree); +#endif + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } +#if CONFIG_CFL + x->cfl_store_y = 0; +#endif + + if (bsize == cm->sb_size) { +#if !CONFIG_PVQ && !CONFIG_LV_MAP + assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip)); +#endif + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} + +static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TOKENEXTRA **tp) { + AV1_COMMON *const cm = &cpi->common; + const TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; + int mi_col; +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 256; +#else + const int leaf_nodes = 64; +#endif // CONFIG_EXT_PARTITION + + // Initialize the left context for the new SB row + av1_zero_left_context(xd); + +#if CONFIG_DELTA_Q + // Reset delta for every tile + if (cm->delta_q_present_flag) + if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex; +#if CONFIG_EXT_DELTA_Q + if (cm->delta_lf_present_flag) + if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0; +#endif +#endif + + // Code each SB in the row + for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; + mi_col += cm->mib_size) { + const struct segmentation *const seg = &cm->seg; + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; +#if CONFIG_SUPERTX + int dummy_rate_nocoef; +#endif // CONFIG_SUPERTX + int i; + int seg_skip = 0; + + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; + + av1_update_boundary_info(cm, tile_info, mi_row, mi_col); + + if (sf->adaptive_pred_interp_filter) { + for (i = 0; i < leaf_nodes; ++i) + td->leaf_tree[i].pred_interp_filter = SWITCHABLE; + + for (i = 0; i < leaf_nodes; ++i) { + td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; + } + } + + av1_zero(x->pred_mv); + pc_root->index = 0; + + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + +#if CONFIG_DELTA_Q + if (cm->delta_q_present_flag) { + // Test mode for delta quantization + int sb_row = mi_row >> 3; + int sb_col = mi_col >> 3; + int sb_stride = (cm->width + MAX_SB_SIZE - 1) >> MAX_SB_SIZE_LOG2; + int index = ((sb_row * sb_stride + sb_col + 8) & 31) - 16; + + // Ensure divisibility of delta_qindex by delta_q_res + int offset_qindex = (index < 0 ? -index - 8 : index - 8); + int qmask = ~(cm->delta_q_res - 1); + int current_qindex = clamp(cm->base_qindex + offset_qindex, + cm->delta_q_res, 256 - cm->delta_q_res); + + current_qindex = + ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) + + cm->base_qindex; + assert(current_qindex > 0); + + xd->delta_qindex = current_qindex - cm->base_qindex; + set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); + xd->mi[0]->mbmi.current_q_index = current_qindex; +#if !CONFIG_EXT_DELTA_Q + xd->mi[0]->mbmi.segment_id = 0; +#endif // CONFIG_EXT_DELTA_Q + av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id); +#if CONFIG_EXT_DELTA_Q + if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) { + int j, k; + int lfmask = ~(cm->delta_lf_res - 1); + int current_delta_lf_from_base = offset_qindex / 2; + current_delta_lf_from_base = + ((current_delta_lf_from_base + cm->delta_lf_res / 2) & lfmask); + + // pre-set the delta lf for loop filter. Note that this value is set + // before mi is assigned for each block in current superblock + for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) { + for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) { + cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)] + .mbmi.current_delta_lf_from_base = current_delta_lf_from_base; + } + } + } +#endif // CONFIG_EXT_DELTA_Q + } +#endif // CONFIG_DELTA_Q + + x->source_variance = UINT_MAX; + if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { + BLOCK_SIZE bsize; + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + bsize = seg_skip ? cm->sb_size : sf->always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, + &dummy_rate, &dummy_dist, +#if CONFIG_SUPERTX + &dummy_rate_nocoef, +#endif // CONFIG_SUPERTX + 1, pc_root); + } else if (cpi->partition_search_skippable_frame) { + BLOCK_SIZE bsize; + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, + &dummy_rate, &dummy_dist, +#if CONFIG_SUPERTX + &dummy_rate_nocoef, +#endif // CONFIG_SUPERTX + 1, pc_root); + } else if (sf->partition_search_type == VAR_BASED_PARTITION) { + choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, + &dummy_rate, &dummy_dist, +#if CONFIG_SUPERTX + &dummy_rate_nocoef, +#endif // CONFIG_SUPERTX + 1, pc_root); + } else { + // If required set upper and lower partition size limits + if (sf->auto_min_max_partition_size) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size); + rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, + &x->min_partition_size, &x->max_partition_size); + } + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size, + &dummy_rdc, +#if CONFIG_SUPERTX + &dummy_rate_nocoef, +#endif // CONFIG_SUPERTX + INT64_MAX, pc_root); + } + } +#if CONFIG_SUBFRAME_PROB_UPDATE + if (cm->do_subframe_update && + cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + const int mi_rows_per_update = + MI_SIZE * AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1); + if ((mi_row + MI_SIZE) % mi_rows_per_update == 0 && + mi_row + MI_SIZE < cm->mi_rows && + cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) { + TX_SIZE t; + SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; + + for (t = 0; t < TX_SIZES; ++t) + av1_full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + av1_partial_adapt_probs(cm, mi_row, mi_col); + ++cm->coef_probs_update_idx; + av1_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx], + cm->fc->coef_probs); + av1_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx], + cpi->td.rd_counts.coef_counts); + av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx], + cm->counts.eob_branch); + av1_fill_token_costs(x->token_costs, cm->fc->coef_probs); + } + } +#endif // CONFIG_SUBFRAME_PROB_UPDATE +} + +static void init_encode_frame_mb_context(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + + // Copy data over into macro block data structures. + av1_setup_src_planes(x, cpi->source, 0, 0); + + av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); +} + +#if !CONFIG_REF_ADAPT +static int check_dual_ref_flags(AV1_COMP *cpi) { + const int ref_flags = cpi->ref_frame_flags; + + if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { + return 0; + } else { + return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) + +#if CONFIG_EXT_REFS + !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) + + !!(ref_flags & AOM_BWD_FLAG) + +#endif // CONFIG_EXT_REFS + !!(ref_flags & AOM_ALT_FLAG)) >= 2; + } +} +#endif // !CONFIG_REF_ADAPT + +#if !CONFIG_VAR_TX +static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) { + int mi_row, mi_col; + const int mis = cm->mi_stride; + MODE_INFO **mi_ptr = cm->mi_grid_visible; + + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size) + mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; + } + } +} +#endif + +static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { + if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME; +#if CONFIG_EXT_REFS + // We will not update the golden frame with an internal overlay frame + else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) || + cpi->rc.is_src_frame_ext_arf) +#else + else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) +#endif + return ALTREF_FRAME; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + return GOLDEN_FRAME; + else + // TODO(zoeliu): To investigate whether a frame_type other than + // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately. + return LAST_FRAME; +} + +static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) { + int i, all_lossless = 1; + + if (cpi->common.seg.enabled) { + for (i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + all_lossless = 0; + break; + } + } + } else { + all_lossless = xd->lossless[0]; + } + if (all_lossless) return ONLY_4X4; + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32 + CONFIG_TX64X64; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD || + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; +} + +void av1_init_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int tile_col, tile_row; + TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + unsigned int tile_tok = 0; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + if (cpi->tile_data != NULL) aom_free(cpi->tile_data); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + int i, j; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = 32; + tile_data->mode_map[i][j] = j; + } + } +#if CONFIG_PVQ + // This will be dynamically increased as more pvq block is encoded. + tile_data->pvq_q.buf_len = 1000; + CHECK_MEM_ERROR( + cm, tile_data->pvq_q.buf, + aom_malloc(tile_data->pvq_q.buf_len * sizeof(PVQ_INFO))); + tile_data->pvq_q.curr_pos = 0; +#endif + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo *const tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + av1_tile_init(tile_info, cm, tile_row, tile_col); + + cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = cpi->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens(*tile_info); +#if CONFIG_PVQ + cpi->tile_data[tile_row * tile_cols + tile_col].pvq_q.curr_pos = 0; +#endif + } + } +} + +void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; + int mi_row; + +#if CONFIG_DEPENDENT_HORZTILES +#if CONFIG_TILE_GROUPS + if ((!cm->dependent_horz_tiles) || (tile_row == 0) || + tile_info->tg_horz_boundary) { +#else + if ((!cm->dependent_horz_tiles) || (tile_row == 0)) { +#endif + av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); + } +#else + av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); +#endif + + // Set up pointers to per thread motion search counters. + this_tile->m_search_count = 0; // Count of motion search hits. + this_tile->ex_search_count = 0; // Exhaustive mesh search hits. + td->mb.m_search_count_ptr = &this_tile->m_search_count; + td->mb.ex_search_count_ptr = &this_tile->ex_search_count; + +#if CONFIG_PVQ + td->mb.pvq_q = &this_tile->pvq_q; + + // TODO(yushin) : activity masking info needs be signaled by a bitstream + td->mb.daala_enc.use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING; + + if (td->mb.daala_enc.use_activity_masking) + td->mb.daala_enc.qm = OD_HVS_QM; // Hard coded. Enc/dec required to sync. + else + td->mb.daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync. + + { + // FIXME: Multiple segments support + int segment_id = 0; + int rdmult = set_segment_rdmult(cpi, &td->mb, segment_id); + int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); +#if CONFIG_HIGHBITDEPTH + const int quantizer_shift = td->mb.e_mbd.bd - 8; +#else + const int quantizer_shift = 0; +#endif // CONFIG_HIGHBITDEPTH + int64_t q_ac = OD_MAXI( + 1, av1_ac_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); + int64_t q_dc = OD_MAXI( + 1, av1_dc_quant(qindex, 0, cpi->common.bit_depth) >> quantizer_shift); + /* td->mb.daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; */ + td->mb.daala_enc.pvq_norm_lambda = + (double)rdmult * (64 / 16) / (q_ac * q_ac * (1 << RDDIV_BITS)); + td->mb.daala_enc.pvq_norm_lambda_dc = + (double)rdmult * (64 / 16) / (q_dc * q_dc * (1 << RDDIV_BITS)); + // printf("%f\n", td->mb.daala_enc.pvq_norm_lambda); + } + od_init_qm(td->mb.daala_enc.state.qm, td->mb.daala_enc.state.qm_inv, + td->mb.daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); + + if (td->mb.daala_enc.use_activity_masking) { + int pli; + int use_masking = td->mb.daala_enc.use_activity_masking; + int segment_id = 0; + int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); + + for (pli = 0; pli < MAX_MB_PLANE; pli++) { + int i; + int q; + + q = qindex; + if (q <= OD_DEFAULT_QMS[use_masking][0][pli].interp_q << OD_COEFF_SHIFT) { + od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, + &OD_DEFAULT_QMS[use_masking][0][pli], NULL); + } else { + i = 0; + while (OD_DEFAULT_QMS[use_masking][i + 1][pli].qm_q4 != NULL && + q > OD_DEFAULT_QMS[use_masking][i + 1][pli].interp_q + << OD_COEFF_SHIFT) { + i++; + } + od_interp_qm(&td->mb.daala_enc.state.pvq_qm_q4[pli][0], q, + &OD_DEFAULT_QMS[use_masking][i][pli], + &OD_DEFAULT_QMS[use_masking][i + 1][pli]); + } + } + } + +#if CONFIG_DAALA_EC + od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + +#if CONFIG_DAALA_EC + od_ec_enc_reset(&td->mb.daala_enc.w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif +#endif // #if CONFIG_PVQ + +#if CONFIG_EC_ADAPT + this_tile->tctx = *cm->fc; + td->mb.e_mbd.tile_ctx = &this_tile->tctx; +#endif // #if CONFIG_EC_ADAPT + +#if CONFIG_CFL + MACROBLOCKD *const xd = &td->mb.e_mbd; + xd->cfl = &this_tile->cfl; + cfl_init(xd->cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); +#endif + +#if CONFIG_PVQ + td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; +#endif // CONFIG_PVQ + + for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += cm->mib_size) { + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + } + + cpi->tok_count[tile_row][tile_col] = + (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); + assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info)); +#if CONFIG_PVQ +#if CONFIG_DAALA_EC + od_ec_enc_clear(&td->mb.daala_enc.w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + + td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos; + // rewind current position so that bitstream can be written + // from the 1st pvq block + td->mb.pvq_q->curr_pos = 0; + + td->mb.pvq_q = NULL; +#endif +} + +static void encode_tiles(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int tile_col, tile_row; + + av1_init_tile_data(cpi); + + for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) + for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) + av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); +} + +#if CONFIG_FP_MB_STATS +static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, + AV1_COMMON *cm, uint8_t **this_frame_mb_stats) { + uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + + cm->current_video_frame * cm->MBs * sizeof(uint8_t); + + if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF; + + *this_frame_mb_stats = mb_stats_in; + + return 1; +} +#endif + +#if CONFIG_GLOBAL_MOTION +#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search +static int gm_get_params_cost(WarpedMotionParams *gm, + WarpedMotionParams *ref_gm, int allow_hp) { + assert(gm->wmtype < GLOBAL_TRANS_TYPES); + int params_cost = 0; + int trans_bits, trans_prec_diff; + switch (gm->wmtype) { + case HOMOGRAPHY: + case HORTRAPEZOID: + case VERTRAPEZOID: + if (gm->wmtype != HORTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF), + (gm->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF)); + if (gm->wmtype != VERTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF), + (gm->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF)); + // Fallthrough intended + case AFFINE: + case ROTZOOM: + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + if (gm->wmtype != VERTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { + if (gm->wmtype != HORTRAPEZOID) + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + // Fallthrough intended + case TRANSLATION: + trans_bits = (gm->wmtype == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (gm->wmtype == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[0] >> trans_prec_diff), + (gm->wmmat[0] >> trans_prec_diff)); + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[1] >> trans_prec_diff), + (gm->wmmat[1] >> trans_prec_diff)); + // Fallthrough intended + case IDENTITY: break; + default: assert(0); + } + return (params_cost << AV1_PROB_COST_SHIFT); +} +#endif // CONFIG_GLOBAL_MOTION + +static void encode_frame_internal(AV1_COMP *cpi) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + int i; +#if CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS + const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME); +#endif // CONFIG_TEMPMV_SIGNALING || CONFIG_EXT_REFS + +#if CONFIG_ADAPT_SCAN + av1_deliver_eob_threshold(cm, xd); +#endif + + x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size); +#if CONFIG_REF_MV + cm->setup_mi(cm); +#endif + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + + av1_zero(*td->counts); + av1_zero(rdc->coef_counts); + av1_zero(rdc->comp_pred_diff); + +#if CONFIG_GLOBAL_MOTION + av1_zero(rdc->global_motion_used); + if (cpi->common.frame_type == INTER_FRAME && cpi->source && + !cpi->global_motion_search_done) { + YV12_BUFFER_CONFIG *ref_buf; + int frame; + double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)]; + const double *params_this_motion; + int inliers_by_motion[RANSAC_NUM_MOTIONS]; + WarpedMotionParams tmp_wm_params; + static const double kInfiniteErrAdv = 1e12; + static const double kIdentityParams[MAX_PARAMDIM - 1] = { + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 + }; + + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + ref_buf = get_ref_frame_buffer(cpi, frame); + if (ref_buf) { + TransformationType model; + aom_clear_system_state(); + for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) { + double best_erroradvantage = kInfiniteErrAdv; + + // Initially set all params to identity. + for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams, + (MAX_PARAMDIM - 1) * sizeof(*params_by_motion)); + } + + compute_global_motion_feature_based( + model, cpi->source, ref_buf, +#if CONFIG_HIGHBITDEPTH + cpi->common.bit_depth, +#endif // CONFIG_HIGHBITDEPTH + inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); + + for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + if (inliers_by_motion[i] == 0) continue; + + params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i; + convert_model_to_params(params_this_motion, &tmp_wm_params); + + if (tmp_wm_params.wmtype != IDENTITY) { + const double erroradv_this_motion = refine_integerized_param( + &tmp_wm_params, tmp_wm_params.wmtype, +#if CONFIG_HIGHBITDEPTH + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, +#endif // CONFIG_HIGHBITDEPTH + ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height, + ref_buf->y_stride, cpi->source->y_buffer, + cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, 3); + if (erroradv_this_motion < best_erroradvantage) { + best_erroradvantage = erroradv_this_motion; + // Save the wm_params modified by refine_integerized_param() + // rather than motion index to avoid rerunning refine() below. + memcpy(&(cm->global_motion[frame]), &tmp_wm_params, + sizeof(WarpedMotionParams)); + } + } + } + if (cm->global_motion[frame].wmtype <= AFFINE) + if (!get_shear_params(&cm->global_motion[frame])) + set_default_warp_params(&cm->global_motion[frame]); + + if (cm->global_motion[frame].wmtype == TRANSLATION) { + cm->global_motion[frame].wmmat[0] = + convert_to_trans_prec(cm->allow_high_precision_mv, + cm->global_motion[frame].wmmat[0]) * + GM_TRANS_ONLY_DECODE_FACTOR; + cm->global_motion[frame].wmmat[1] = + convert_to_trans_prec(cm->allow_high_precision_mv, + cm->global_motion[frame].wmmat[1]) * + GM_TRANS_ONLY_DECODE_FACTOR; + } + + // If the best error advantage found doesn't meet the threshold for + // this motion type, revert to IDENTITY. + if (!is_enough_erroradvantage( + best_erroradvantage, + gm_get_params_cost(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], + cm->allow_high_precision_mv))) { + set_default_warp_params(&cm->global_motion[frame]); + } + + if (cm->global_motion[frame].wmtype != IDENTITY) break; + } + aom_clear_system_state(); + } + cpi->gmparams_cost[frame] = + gm_get_params_cost(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], + cm->allow_high_precision_mv) + + cpi->gmtype_cost[cm->global_motion[frame].wmtype] - + cpi->gmtype_cost[IDENTITY]; + } + cpi->global_motion_search_done = 1; + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + TOTAL_REFS_PER_FRAME * sizeof(WarpedMotionParams)); +#endif // CONFIG_GLOBAL_MOTION + + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = cm->seg.enabled + ? av1_get_qindex(&cm->seg, i, cm->base_qindex) + : cm->base_qindex; + xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; + xd->qindex[i] = qindex; + } + + if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0; + + cm->tx_mode = select_tx_mode(cpi, xd); + +#if CONFIG_DELTA_Q + // Fix delta q resolution for the moment + cm->delta_q_res = DEFAULT_DELTA_Q_RES; +// Set delta_q_present_flag before it is used for the first time +#if CONFIG_EXT_DELTA_Q + cm->delta_lf_res = DEFAULT_DELTA_LF_RES; + // update delta_q_present_flag and delta_lf_present_flag based on base_qindex + cm->delta_q_present_flag &= cm->base_qindex > 0; + cm->delta_lf_present_flag &= cm->base_qindex > 0; +#else + cm->delta_q_present_flag = + cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0; +#endif // CONFIG_EXT_DELTA_Q +#endif + + av1_frame_init_quantizer(cpi); + + av1_initialize_rd_consts(cpi); + av1_initialize_me_consts(cpi, x, cm->base_qindex); + init_encode_frame_mb_context(cpi); +#if CONFIG_TEMPMV_SIGNALING + if (last_fb_buf_idx != INVALID_IDX) { + cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx]; + cm->use_prev_frame_mvs &= !cm->error_resilient_mode && + cm->width == cm->prev_frame->buf.y_width && + cm->height == cm->prev_frame->buf.y_height && + !cm->intra_only && !cm->prev_frame->intra_only; + } +#else + cm->use_prev_frame_mvs = + !cm->error_resilient_mode && cm->width == cm->last_width && + cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame; +#endif + +#if CONFIG_EXT_REFS + // NOTE(zoeliu): As cm->prev_frame can take neither a frame of + // show_exisiting_frame=1, nor can it take a frame not used as + // a reference, it is probable that by the time it is being + // referred to, the frame buffer it originally points to may + // already get expired and have been reassigned to the current + // newly coded frame. Hence, we need to check whether this is + // the case, and if yes, we have 2 choices: + // (1) Simply disable the use of previous frame mvs; or + // (2) Have cm->prev_frame point to one reference frame buffer, + // e.g. LAST_FRAME. + if (cm->use_prev_frame_mvs && !enc_is_ref_frame_buf(cpi, cm->prev_frame)) { + // Reassign the LAST_FRAME buffer to cm->prev_frame. + cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx]; + } +#endif // CONFIG_EXT_REFS + + // Special case: set prev_mi to NULL when the previous mode info + // context cannot be used. + cm->prev_mi = + cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL; + +#if CONFIG_VAR_TX + x->txb_split_count = 0; +#if CONFIG_REF_MV + av1_zero(x->blk_skip_drl); +#endif +#endif + + if (cpi->sf.partition_search_type == VAR_BASED_PARTITION && + cpi->td.var_root[0] == NULL) + av1_setup_var_tree(&cpi->common, &cpi->td); + + { + struct aom_usec_timer emr_timer; + aom_usec_timer_start(&emr_timer); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, + &cpi->twopass.this_frame_mb_stats); + } +#endif + + // If allowed, encoding tiles in parallel with one thread handling one tile. + // TODO(geza.lore): The multi-threaded encoder is not safe with more than + // 1 tile rows, as it uses the single above_context et al arrays from + // cpi->common + if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1) + av1_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + + aom_usec_timer_mark(&emr_timer); + cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); + } + +#if 0 + // Keep record of the total distortion this time around for future use + cpi->last_frame_distortion = cpi->frame_distortion; +#endif +} + +void av1_encode_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; +#if CONFIG_EXT_TX + // Indicates whether or not to use a default reduced set for ext-tx + // rather than the potential full set of 16 transforms + cm->reduced_tx_set_used = 0; +#endif // CONFIG_EXT_TX + + // In the longer term the encoder should be generalized to match the + // decoder such that we allow compound where one of the 3 buffers has a + // different sign bias and that buffer is then the fixed ref. However, this + // requires further work in the rd loop. For now the only supported encoder + // side behavior is where the ALT ref buffer has opposite sign bias to + // the other two. + if (!frame_is_intra_only(cm)) { +#if CONFIG_LOWDELAY_COMPOUND // Normative in encoder + cpi->allow_comp_inter_inter = 1; +#if CONFIG_EXT_REFS + cm->comp_fwd_ref[0] = LAST_FRAME; + cm->comp_fwd_ref[1] = LAST2_FRAME; + cm->comp_fwd_ref[2] = LAST3_FRAME; + cm->comp_fwd_ref[3] = GOLDEN_FRAME; + cm->comp_bwd_ref[0] = BWDREF_FRAME; + cm->comp_bwd_ref[1] = ALTREF_FRAME; +#else + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; +#endif // CONFIG_EXT_REFS +#else + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) || + (cm->ref_frame_sign_bias[ALTREF_FRAME] == + cm->ref_frame_sign_bias[LAST_FRAME])) { + cpi->allow_comp_inter_inter = 0; + } else { + cpi->allow_comp_inter_inter = 1; + +#if CONFIG_EXT_REFS + cm->comp_fwd_ref[0] = LAST_FRAME; + cm->comp_fwd_ref[1] = LAST2_FRAME; + cm->comp_fwd_ref[2] = LAST3_FRAME; + cm->comp_fwd_ref[3] = GOLDEN_FRAME; + cm->comp_bwd_ref[0] = BWDREF_FRAME; + cm->comp_bwd_ref[1] = ALTREF_FRAME; +#else + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; +#endif // CONFIG_EXT_REFS + } +#endif + } else { + cpi->allow_comp_inter_inter = 0; + } + + if (cpi->sf.frame_parameter_update) { + int i; + RD_OPT *const rd_opt = &cpi->rd; + FRAME_COUNTS *counts = cpi->td.counts; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + + // This code does a single RD pass over the whole frame assuming + // either compound, single or hybrid prediction as per whatever has + // worked best for that type of frame in the past. + // It also predicts whether another coding mode would have worked + // better than this coding mode. If that is the case, it remembers + // that for subsequent frames. + // It does the same analysis for transform size selection also. + // + // TODO(zoeliu): To investigate whether a frame_type other than + // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately. + const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); + int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; + const int is_alt_ref = frame_type == ALTREF_FRAME; + +/* prediction (compound, single or hybrid) mode selection */ +#if CONFIG_REF_ADAPT + // NOTE(zoeliu): "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames + if (is_alt_ref || !cpi->allow_comp_inter_inter) + cm->reference_mode = SINGLE_REFERENCE; + else + cm->reference_mode = REFERENCE_MODE_SELECT; +#else + if (is_alt_ref || !cpi->allow_comp_inter_inter) + cm->reference_mode = SINGLE_REFERENCE; + else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && + mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] && + check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + cm->reference_mode = COMPOUND_REFERENCE; + else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) + cm->reference_mode = SINGLE_REFERENCE; + else + cm->reference_mode = REFERENCE_MODE_SELECT; +#endif // CONFIG_REF_ADAPT + +#if CONFIG_DUAL_FILTER + cm->interp_filter = SWITCHABLE; +#endif + + encode_frame_internal(cpi); + + for (i = 0; i < REFERENCE_MODES; ++i) + mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + av1_zero(counts->comp_inter); +#if !CONFIG_REF_ADAPT + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + av1_zero(counts->comp_inter); +#endif // !CONFIG_REF_ADAPT + } + } + +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) + cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64; +#else + if (cm->tx_mode == TX_MODE_SELECT) { +#if CONFIG_TX64X64 + int count4x4 = 0; + int count8x8_8x8p = 0, count8x8_lp = 0; + int count16x16_16x16p = 0, count16x16_lp = 0; + int count32x32_32x32p = 0, count32x32_lp = 0; + int count64x64_64x64p = 0; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // counts->tx_size[max_depth][context_idx][this_depth_level] + count4x4 += counts->tx_size[0][i][0]; + count4x4 += counts->tx_size[1][i][0]; + count4x4 += counts->tx_size[2][i][0]; + count4x4 += counts->tx_size[3][i][0]; + + count8x8_8x8p += counts->tx_size[0][i][1]; + count8x8_lp += counts->tx_size[1][i][1]; + count8x8_lp += counts->tx_size[2][i][1]; + count8x8_lp += counts->tx_size[3][i][1]; + + count16x16_16x16p += counts->tx_size[1][i][2]; + count16x16_lp += counts->tx_size[2][i][2]; + count16x16_lp += counts->tx_size[3][i][2]; + + count32x32_32x32p += counts->tx_size[2][i][3]; + count32x32_lp += counts->tx_size[3][i][3]; + + count64x64_64x64p += counts->tx_size[3][i][4]; + } +#if CONFIG_EXT_TX && CONFIG_RECT_TX + count4x4 += counts->tx_size_implied[0][TX_4X4]; + count4x4 += counts->tx_size_implied[1][TX_4X4]; + count4x4 += counts->tx_size_implied[2][TX_4X4]; + count4x4 += counts->tx_size_implied[3][TX_4X4]; + count8x8_8x8p += counts->tx_size_implied[1][TX_8X8]; + count8x8_lp += counts->tx_size_implied[2][TX_8X8]; + count8x8_lp += counts->tx_size_implied[3][TX_8X8]; + count8x8_lp += counts->tx_size_implied[4][TX_8X8]; + count16x16_16x16p += counts->tx_size_implied[2][TX_16X16]; + count16x16_lp += counts->tx_size_implied[3][TX_16X16]; + count16x16_lp += counts->tx_size_implied[4][TX_16X16]; + count32x32_32x32p += counts->tx_size_implied[3][TX_32X32]; + count32x32_lp += counts->tx_size_implied[4][TX_32X32]; + count64x64_64x64p += counts->tx_size_implied[4][TX_64X64]; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && + count32x32_lp == 0 && count32x32_32x32p == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_16X16] == 0 && + cm->counts.supertx_size[TX_32X32] == 0 && + cm->counts.supertx_size[TX_64X64] == 0 && +#endif + count64x64_64x64p == 0) { + cm->tx_mode = ALLOW_8X8; + reset_skip_tx_size(cm, TX_8X8); + } else if (count8x8_8x8p == 0 && count8x8_lp == 0 && + count16x16_16x16p == 0 && count16x16_lp == 0 && + count32x32_32x32p == 0 && count32x32_lp == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_8X8] == 0 && + cm->counts.supertx_size[TX_16X16] == 0 && + cm->counts.supertx_size[TX_32X32] == 0 && + cm->counts.supertx_size[TX_64X64] == 0 && +#endif + count64x64_64x64p == 0) { + cm->tx_mode = ONLY_4X4; + reset_skip_tx_size(cm, TX_4X4); + } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && + count32x32_lp == 0) { + cm->tx_mode = ALLOW_64X64; + } else if (count4x4 == 0 && count8x8_lp == 0 && count16x16_lp == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_64X64] == 0 && +#endif + count64x64_64x64p == 0) { + cm->tx_mode = ALLOW_32X32; + reset_skip_tx_size(cm, TX_32X32); + } else if (count4x4 == 0 && count8x8_lp == 0 && count32x32_lp == 0 && + count32x32_32x32p == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_32X32] == 0 && + cm->counts.supertx_size[TX_64X64] == 0 && +#endif + count64x64_64x64p == 0) { + cm->tx_mode = ALLOW_16X16; + reset_skip_tx_size(cm, TX_16X16); + } + +#else // CONFIG_TX64X64 + + int count4x4 = 0; + int count8x8_lp = 0, count8x8_8x8p = 0; + int count16x16_16x16p = 0, count16x16_lp = 0; + int count32x32 = 0; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // counts->tx_size[max_depth][context_idx][this_depth_level] + count4x4 += counts->tx_size[0][i][0]; + count4x4 += counts->tx_size[1][i][0]; + count4x4 += counts->tx_size[2][i][0]; + + count8x8_8x8p += counts->tx_size[0][i][1]; + count8x8_lp += counts->tx_size[1][i][1]; + count8x8_lp += counts->tx_size[2][i][1]; + + count16x16_16x16p += counts->tx_size[1][i][2]; + count16x16_lp += counts->tx_size[2][i][2]; + count32x32 += counts->tx_size[2][i][3]; + } +#if CONFIG_EXT_TX && CONFIG_RECT_TX + count4x4 += counts->tx_size_implied[0][TX_4X4]; + count4x4 += counts->tx_size_implied[1][TX_4X4]; + count4x4 += counts->tx_size_implied[2][TX_4X4]; + count4x4 += counts->tx_size_implied[3][TX_4X4]; + count8x8_8x8p += counts->tx_size_implied[1][TX_8X8]; + count8x8_lp += counts->tx_size_implied[2][TX_8X8]; + count8x8_lp += counts->tx_size_implied[3][TX_8X8]; + count16x16_lp += counts->tx_size_implied[3][TX_16X16]; + count16x16_16x16p += counts->tx_size_implied[2][TX_16X16]; + count32x32 += counts->tx_size_implied[3][TX_32X32]; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_16X16] == 0 && + cm->counts.supertx_size[TX_32X32] == 0 && +#endif // CONFIG_SUPERTX + count32x32 == 0) { + cm->tx_mode = ALLOW_8X8; + reset_skip_tx_size(cm, TX_8X8); + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && + count8x8_lp == 0 && count16x16_lp == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_8X8] == 0 && + cm->counts.supertx_size[TX_16X16] == 0 && + cm->counts.supertx_size[TX_32X32] == 0 && +#endif // CONFIG_SUPERTX + count32x32 == 0) { + cm->tx_mode = ONLY_4X4; + reset_skip_tx_size(cm, TX_4X4); + } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { + cm->tx_mode = ALLOW_32X32; + } else if (count32x32 == 0 && count8x8_lp == 0 && +#if CONFIG_SUPERTX + cm->counts.supertx_size[TX_32X32] == 0 && +#endif // CONFIG_SUPERTX + count4x4 == 0) { + cm->tx_mode = ALLOW_16X16; + reset_skip_tx_size(cm, TX_16X16); + } +#endif // CONFIG_TX64X64 + } +#endif + } else { + encode_frame_internal(cpi); + } +} + +static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, + const MODE_INFO *mi, const MODE_INFO *above_mi, + const MODE_INFO *left_mi, const int intraonly, + const int mi_row, const int mi_col) { + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const PREDICTION_MODE y_mode = mbmi->mode; + const PREDICTION_MODE uv_mode = mbmi->uv_mode; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int unify_bsize = CONFIG_CB4X4; + + if (bsize < BLOCK_8X8 && !unify_bsize) { + int idx, idy; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int bidx = idy * 2 + idx; + const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode; + if (intraonly) { + const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx); + const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx); + ++counts->kf_y_mode[a][l][bmode]; + } else { + ++counts->y_mode[0][bmode]; + } + } + } else { + if (intraonly) { + const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0); + const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0); + ++counts->kf_y_mode[above][left][y_mode]; + } else { + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; + } +#if CONFIG_FILTER_INTRA + if (mbmi->mode == DC_PRED +#if CONFIG_PALETTE + && mbmi->palette_mode_info.palette_size[0] == 0 +#endif // CONFIG_PALETTE + ) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra_mode[0]; + ++counts->filter_intra[0][use_filter_intra_mode]; + } + if (mbmi->uv_mode == DC_PRED +#if CONFIG_PALETTE + && mbmi->palette_mode_info.palette_size[1] == 0 +#endif // CONFIG_PALETTE + ) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra_mode[1]; + ++counts->filter_intra[1][use_filter_intra_mode]; + } +#endif // CONFIG_FILTER_INTRA +#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP + if (av1_is_directional_mode(mbmi->mode, bsize)) { + const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); + const int p_angle = + mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; + if (av1_is_intra_filter_switchable(p_angle)) + ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter]; + } +#endif // CONFIG_INTRA_INTERP && CONFIG_INTRA_INTERP + } + +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) + return; +#else + (void)mi_row; + (void)mi_col; + (void)xd; +#endif + ++counts->uv_mode[y_mode][uv_mode]; +} + +#if CONFIG_VAR_TX +static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, + FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, + int blk_row, int blk_col) { + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int tx_row = blk_row >> 1; + const int tx_col = blk_col >> 1; + const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + int ctx = txfm_partition_context(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, + mbmi->sb_type, tx_size); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size) { + ++counts->txfm_partition[ctx][0]; + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, tx_size, tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bs = tx_size_wide_unit[sub_txs]; + int i; + + ++counts->txfm_partition[ctx][1]; + ++x->txb_split_count; + + if (tx_size == TX_8X8) { + mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->tx_size = TX_4X4; + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, TX_4X4, tx_size); + return; + } + + for (i = 0; i < 4; ++i) { + int offsetr = (i >> 1) * bs; + int offsetc = (i & 0x01) * bs; + update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, + blk_col + offsetc); + } + } +} + +static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE plane_bsize, int mi_row, + int mi_col, FRAME_COUNTS *td_counts) { + MACROBLOCKD *xd = &x->e_mbd; + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + int idx, idy; + + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + for (idy = 0; idy < mi_height; idy += bh) + for (idx = 0; idx < mi_width; idx += bw) + update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height, + idy, idx); +} + +static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, + int blk_col) { + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int tx_row = blk_row >> 1; + const int tx_col = blk_col >> 1; + const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size) { + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, tx_size, tx_size); + + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int i; + + if (tx_size == TX_8X8) { + mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; + mbmi->tx_size = TX_4X4; + txfm_partition_update(xd->above_txfm_context + tx_col, + xd->left_txfm_context + tx_row, TX_4X4, tx_size); + return; + } + + assert(bsl > 0); + for (i = 0; i < 4; ++i) { + int offsetr = (i >> 1) * bsl; + int offsetc = (i & 0x01) * bsl; + set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc); + } + } +} + +static void tx_partition_set_contexts(const AV1_COMMON *const cm, + MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, + int mi_row, int mi_col) { + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + int idx, idy; + + xd->above_txfm_context = cm->above_txfm_context + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + for (idy = 0; idy < mi_height; idy += bh) + for (idx = 0; idx < mi_width; idx += bw) + set_txfm_context(xd, max_tx_size, idy, idx); +} +#endif + +void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, +#if CONFIG_TXK_SEL + int block, int plane, +#endif + BLOCK_SIZE bsize, TX_SIZE tx_size, + FRAME_COUNTS *counts) { + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int is_inter = is_inter_block(mbmi); +#if !CONFIG_TXK_SEL + TX_TYPE tx_type = mbmi->tx_type; +#else + // Only y plane's tx_type is updated + if (plane > 0) return; + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); +#endif +#if CONFIG_EXT_TX + if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = + get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); + if (eset > 0) { + if (is_inter) { + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type]; + } else { + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode] + [tx_type]; + } + } + } +#else + (void)bsize; + if (tx_size < TX_32X32 && + ((!cm->seg.enabled && cm->base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (is_inter) { + ++counts->inter_ext_tx[tx_size][tx_type]; + } else { + ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]] + [tx_type]; + } + } +#endif // CONFIG_EXT_TX +} + +static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int *rate) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO **mi_8x8 = xd->mi; + MODE_INFO *mi = mi_8x8[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + const int mis = cm->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const int is_inter = is_inter_block(mbmi); +#if CONFIG_CB4X4 + const BLOCK_SIZE block_size = bsize; +#else + const BLOCK_SIZE block_size = AOMMAX(bsize, BLOCK_8X8); +#endif + +#if CONFIG_PVQ + x->pvq_speed = 0; + x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0; +#endif +#if CONFIG_CFL + x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 1 : 0; +#endif + + if (!is_inter) { + int plane; + mbmi->skip = 1; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1, + mi_row, mi_col); + } + if (!dry_run) { + sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi, + frame_is_intra_only(cm), mi_row, mi_col); + } +#if CONFIG_PALETTE + if (bsize >= BLOCK_8X8 && !dry_run) { + for (plane = 0; plane <= 1; ++plane) { + if (mbmi->palette_mode_info.palette_size[plane] > 0) { + mbmi->palette_mode_info.palette_first_color_idx[plane] = + xd->plane[plane].color_index_map[0]; + // TODO(huisu): this increases the use of token buffer. Needs stretch + // test to verify. + av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate); + } + } + } +#endif // CONFIG_PALETTE +#if CONFIG_VAR_TX + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif +#if CONFIG_LV_MAP + av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); +#else // CONFIG_LV_MAP + av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); +#endif // CONFIG_LV_MAP + } else { + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); +#if CONFIG_INTRABC + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); +#else + assert(cfg != NULL); +#endif // !CONFIG_INTRABC + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf); + } + if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip) + av1_build_inter_predictors_sby(xd, mi_row, mi_col, NULL, block_size); + + av1_build_inter_predictors_sbuv(xd, mi_row, mi_col, NULL, block_size); +#if CONFIG_MOTION_VAR + if (mbmi->motion_mode == OBMC_CAUSAL) { +#if CONFIG_NCOBMC + if (dry_run == OUTPUT_ENABLED) + av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + else +#endif + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + } +#endif // CONFIG_MOTION_VAR + + av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col); +#if CONFIG_VAR_TX + if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); + av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, block_size, + rate); +#else +#if CONFIG_LV_MAP + av1_update_txb_context(cpi, td, dry_run, block_size, rate, mi_row, mi_col); +#else // CONFIG_LV_MAP + av1_tokenize_sb(cpi, td, t, dry_run, block_size, rate, mi_row, mi_col); +#endif // CONFIG_LV_MAP +#endif + } + + if (!dry_run) { +#if CONFIG_VAR_TX + TX_SIZE tx_size = + is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size; +#else + TX_SIZE tx_size = mbmi->tx_size; +#endif + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] && +#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX + mbmi->sb_type > BLOCK_4X4 && +#else + mbmi->sb_type >= BLOCK_8X8 && +#endif + !(is_inter && (mbmi->skip || seg_skip))) { +#if CONFIG_VAR_TX + if (is_inter) { + tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts); + } else { + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + const int depth = tx_size_to_depth(coded_tx_size); + ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; + if (tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count; + } +#else + const int tx_size_ctx = get_tx_size_context(xd); + const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + const int depth = tx_size_to_depth(coded_tx_size); + + ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; +#endif +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + } else { + int i, j; + TX_SIZE intra_tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + intra_tx_size = TX_4X4; + } else { + intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + } + } else { +#if CONFIG_EXT_TX && CONFIG_RECT_TX + intra_tx_size = tx_size; +#else + intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + } +#if CONFIG_EXT_TX && CONFIG_RECT_TX + ++td->counts->tx_size_implied[max_txsize_lookup[bsize]] + [txsize_sqr_up_map[tx_size]]; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + for (j = 0; j < mi_height; j++) + for (i = 0; i < mi_width; i++) + if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows) + mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size; + +#if CONFIG_VAR_TX + mbmi->min_tx_size = get_min_tx_size(intra_tx_size); + if (intra_tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count; +#endif + } + + ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]]; + ++td->counts + ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]]; +#if !CONFIG_TXK_SEL + av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts); +#endif + } + +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && +#if CONFIG_CB4X4 + mbmi->sb_type > BLOCK_4X4 && +#else + mbmi->sb_type >= BLOCK_8X8 && +#endif + is_inter && !(mbmi->skip || seg_skip)) { + if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col); + } else { + TX_SIZE tx_size = mbmi->tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) + tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter); + else + tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; + mbmi->tx_size = tx_size; + set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd); + } +#endif // CONFIG_VAR_TX +} + +#if CONFIG_SUPERTX +static int check_intra_b(PICK_MODE_CONTEXT *ctx) { + if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1; +#if CONFIG_EXT_INTER + if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1; +#endif // CONFIG_EXT_INTER + return 0; +} + +static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + const int hbs = mi_size_wide[bsize] / 2; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_EXT_PARTITION_TYPES + int i; +#endif +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + +#if !CONFIG_CB4X4 + assert(bsize >= BLOCK_8X8); +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1; + + switch (partition) { + case PARTITION_NONE: return check_intra_b(&pc_tree->none); break; + case PARTITION_VERT: + if (check_intra_b(&pc_tree->vertical[0])) return 1; + if (mi_col + hbs < cm->mi_cols && (bsize > BLOCK_8X8 || unify_bsize)) { + if (check_intra_b(&pc_tree->vertical[1])) return 1; + } + break; + case PARTITION_HORZ: + if (check_intra_b(&pc_tree->horizontal[0])) return 1; + if (mi_row + hbs < cm->mi_rows && (bsize > BLOCK_8X8 || unify_bsize)) { + if (check_intra_b(&pc_tree->horizontal[1])) return 1; + } + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + if (check_intra_b(pc_tree->leaf_split[0])) return 1; + } else { + if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize, + pc_tree->split[0])) + return 1; + if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize, + pc_tree->split[1])) + return 1; + if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize, + pc_tree->split[2])) + return 1; + if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize, + pc_tree->split[3])) + return 1; + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + for (i = 0; i < 3; i++) { + if (check_intra_b(&pc_tree->horizontala[i])) return 1; + } + break; + case PARTITION_HORZ_B: + for (i = 0; i < 3; i++) { + if (check_intra_b(&pc_tree->horizontalb[i])) return 1; + } + break; + case PARTITION_VERT_A: + for (i = 0; i < 3; i++) { + if (check_intra_b(&pc_tree->verticala[i])) return 1; + } + break; + case PARTITION_VERT_B: + for (i = 0; i < 3; i++) { + if (check_intra_b(&pc_tree->verticalb[i])) return 1; + } + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } + return 0; +} + +static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) { + return ctx->mic.mbmi.tx_size == supertx_size; +} + +static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, + PC_TREE *pc_tree) { + PARTITION_TYPE partition; + BLOCK_SIZE subsize; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + + partition = pc_tree->partitioning; + subsize = get_subsize(bsize, partition); + switch (partition) { + case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none); + case PARTITION_VERT: + return check_supertx_b(supertx_size, &pc_tree->vertical[0]); + case PARTITION_HORZ: + return check_supertx_b(supertx_size, &pc_tree->horizontal[0]); + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) + return check_supertx_b(supertx_size, pc_tree->leaf_split[0]); + else + return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]); +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + return check_supertx_b(supertx_size, &pc_tree->horizontala[0]); + case PARTITION_HORZ_B: + return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]); + case PARTITION_VERT_A: + return check_supertx_b(supertx_size, &pc_tree->verticala[0]); + case PARTITION_VERT_B: + return check_supertx_b(supertx_size, &pc_tree->verticalb[0]); +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); return 0; + } +} + +static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, +#if CONFIG_EXT_INTER + int mi_row_ori, int mi_col_ori, +#endif // CONFIG_EXT_INTER + int mi_row_pred, int mi_col_pred, + BLOCK_SIZE bsize_pred, int b_sub8x8, int block) { + // Used in supertx + // (mi_row_ori, mi_col_ori): location for mv + // (mi_row_pred, mi_col_pred, bsize_pred): region to predict + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi_8x8 = xd->mi[0]; + MODE_INFO *mi = mi_8x8; + MB_MODE_INFO *mbmi = &mi->mbmi; + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + for (ref = 0; ref < 1 + is_compound; ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); + av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred, + &xd->block_refs[ref]->sf); + } + + if (!b_sub8x8) + av1_build_inter_predictors_sb_extend(xd, +#if CONFIG_EXT_INTER + mi_row_ori, mi_col_ori, +#endif // CONFIG_EXT_INTER + mi_row_pred, mi_col_pred, bsize_pred); + else + av1_build_inter_predictors_sb_sub8x8_extend(xd, +#if CONFIG_EXT_INTER + mi_row_ori, mi_col_ori, +#endif // CONFIG_EXT_INTER + mi_row_pred, mi_col_pred, + bsize_pred, block); +} + +static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int block, + int mi_row_ori, int mi_col_ori, int mi_row_pred, + int mi_col_pred, int mi_row_top, int mi_col_top, + uint8_t *dst_buf[3], int dst_stride[3], + BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, + RUN_TYPE dry_run, int b_sub8x8, int bextend) { + // Used in supertx + // (mi_row_ori, mi_col_ori): location for mv + // (mi_row_pred, mi_col_pred, bsize_pred): region to predict + // (mi_row_top, mi_col_top, bsize_top): region of the top partition size + // block: sub location of sub8x8 blocks + // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8 + // bextend: 1: region to predict is an extension of ori; 0: not + + MACROBLOCK *const x = &td->mb; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int r = (mi_row_pred - mi_row_top) * MI_SIZE; + int c = (mi_col_pred - mi_col_top) * MI_SIZE; + const int mi_width_top = mi_size_wide[bsize_top]; + const int mi_height_top = mi_size_high[bsize_top]; + + if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top || + mi_row_pred >= mi_row_top + mi_height_top || + mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows || + mi_col_pred >= cm->mi_cols) + return; + + set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori, + mi_col_ori, bsize_pred); + xd->plane[0].dst.stride = dst_stride[0]; + xd->plane[1].dst.stride = dst_stride[1]; + xd->plane[2].dst.stride = dst_stride[2]; + xd->plane[0].dst.buf = dst_buf[0] + + (r >> xd->plane[0].subsampling_y) * dst_stride[0] + + (c >> xd->plane[0].subsampling_x); + xd->plane[1].dst.buf = dst_buf[1] + + (r >> xd->plane[1].subsampling_y) * dst_stride[1] + + (c >> xd->plane[1].subsampling_x); + xd->plane[2].dst.buf = dst_buf[2] + + (r >> xd->plane[2].subsampling_y) * dst_stride[2] + + (c >> xd->plane[2].subsampling_x); + + predict_superblock(cpi, td, +#if CONFIG_EXT_INTER + mi_row_ori, mi_col_ori, +#endif // CONFIG_EXT_INTER + mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block); + + if (!dry_run && !bextend) + update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1); +} + +static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int block, BLOCK_SIZE bsize, + BLOCK_SIZE top_bsize, int mi_row, int mi_col, + int mi_row_top, int mi_col_top, RUN_TYPE dry_run, + uint8_t *dst_buf[3], int dst_stride[3], int dir) { + // dir: 0-lower, 1-upper, 2-left, 3-right + // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright + MACROBLOCKD *xd = &td->mb.e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + int xss = xd->plane[1].subsampling_x; + int yss = xd->plane[1].subsampling_y; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + int b_sub8x8 = (bsize < BLOCK_8X8) && !unify_bsize ? 1 : 0; + int wide_unit, high_unit; + int i, j; + int ext_offset = 0; + + BLOCK_SIZE extend_bsize; + int mi_row_pred, mi_col_pred; + + if (dir == 0 || dir == 1) { // lower and upper + extend_bsize = + (mi_width == mi_size_wide[BLOCK_8X8] || bsize < BLOCK_8X8 || xss < yss) + ? BLOCK_8X8 + : BLOCK_16X8; + +#if CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + extend_bsize = BLOCK_4X4; + ext_offset = mi_size_wide[BLOCK_8X8]; + } +#endif + wide_unit = mi_size_wide[extend_bsize]; + high_unit = mi_size_high[extend_bsize]; + + mi_row_pred = mi_row + ((dir == 0) ? mi_height : -(mi_height + ext_offset)); + mi_col_pred = mi_col; + + for (j = 0; j < mi_height + ext_offset; j += high_unit) + for (i = 0; i < mi_width + ext_offset; i += wide_unit) + predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, + mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, + dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, + 1); + } else if (dir == 2 || dir == 3) { // left and right + extend_bsize = + (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss) + ? BLOCK_8X8 + : BLOCK_8X16; +#if CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + extend_bsize = BLOCK_4X4; + ext_offset = mi_size_wide[BLOCK_8X8]; + } +#endif + wide_unit = mi_size_wide[extend_bsize]; + high_unit = mi_size_high[extend_bsize]; + + mi_row_pred = mi_row; + mi_col_pred = mi_col + ((dir == 3) ? mi_width : -(mi_width + ext_offset)); + + for (j = 0; j < mi_height + ext_offset; j += high_unit) + for (i = 0; i < mi_width + ext_offset; i += wide_unit) + predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, + mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, + dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, + 1); + } else { + extend_bsize = BLOCK_8X8; +#if CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + extend_bsize = BLOCK_4X4; + ext_offset = mi_size_wide[BLOCK_8X8]; + } +#endif + wide_unit = mi_size_wide[extend_bsize]; + high_unit = mi_size_high[extend_bsize]; + + mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height + : -(mi_height + ext_offset)); + mi_col_pred = + mi_col + ((dir == 6 || dir == 7) ? mi_width : -(mi_width + ext_offset)); + + for (j = 0; j < mi_height + ext_offset; j += high_unit) + for (i = 0; i < mi_width + ext_offset; i += wide_unit) + predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, + mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, + dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, + 1); + } +} + +static void extend_all(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int block, BLOCK_SIZE bsize, + BLOCK_SIZE top_bsize, int mi_row, int mi_col, + int mi_row_top, int mi_col_top, RUN_TYPE dry_run, + uint8_t *dst_buf[3], int dst_stride[3]) { + assert(block >= 0 && block < 4); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 0); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 1); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 2); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 3); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 4); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 5); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 6); + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride, 7); +} + +// This function generates prediction for multiple blocks, between which +// discontinuity around boundary is reduced by smoothing masks. The basic +// smoothing mask is a soft step function along horz/vert direction. In more +// complicated case when a block is split into 4 subblocks, the basic mask is +// first applied to neighboring subblocks (2 pairs) in horizontal direction and +// then applied to the 2 masked prediction mentioned above in vertical direction +// If the block is split into more than one level, at every stage, masked +// prediction is stored in dst_buf[] passed from higher level. +static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, + int mi_col, int mi_row_top, int mi_col_top, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], + int dst_stride[3], PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? partition_plane_context(xd, mi_row, mi_col, +#if CONFIG_UNPOISON_PARTITION_CTX + mi_row + hbs < cm->mi_rows, + mi_col + hbs < cm->mi_cols, +#endif + bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_subsize(bsize, partition); +#if CONFIG_EXT_PARTITION_TYPES + const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); +#endif + + int i; + uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3]; + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]); + int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; + int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; + int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE }; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; + assert(bsize >= BLOCK_8X8); +#endif + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); + dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len); + dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); + dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len); + dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len); + dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3); + dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len); + dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len); + } else { +#endif // CONFIG_HIGHBITDEPTH + dst_buf1[0] = tmp_buf1; + dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE; + dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE; + dst_buf2[0] = tmp_buf2; + dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE; + dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE; + dst_buf3[0] = tmp_buf3; + dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE; + dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + if (!dry_run && ctx >= 0 && bsize < top_bsize) { + // Explicitly cast away const. + FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts; + frame_counts->partition[ctx][partition]++; + } + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + } + + switch (partition) { + case PARTITION_NONE: + assert(bsize < top_bsize); + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + bsize, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, dst_buf, dst_stride); + break; + case PARTITION_HORZ: + if (bsize == BLOCK_8X8 && !unify_bsize) { + // Fisrt half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + BLOCK_8X8, dry_run, 1, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + + // Second half + predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf1, dst_stride1, + top_bsize, BLOCK_8X8, dry_run, 1, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + // Smooth + xd->plane[0].dst.buf = dst_buf[0]; + xd->plane[0].dst.stride = dst_stride[0]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, + 0); + } else { + // First half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); + + if (mi_row + hbs < cm->mi_rows) { + // Second half + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dst_buf1, + dst_stride1, top_bsize, subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1, 1); + + // Smooth + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_HORZ, i); + } + } + } + break; + case PARTITION_VERT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + // First half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + BLOCK_8X8, dry_run, 1, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + + // Second half + predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf1, dst_stride1, + top_bsize, BLOCK_8X8, dry_run, 1, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + // Smooth + xd->plane[0].dst.buf = dst_buf[0]; + xd->plane[0].dst.stride = dst_stride[0]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, + 0); + } else { + // bsize: not important, not useful + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); + + if (mi_col + hbs < cm->mi_cols) { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, + dst_stride1, top_bsize, subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1, 2); + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_VERT, i); + } + } + } + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_8X8 && !unify_bsize) { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + BLOCK_8X8, dry_run, 1, 0); + predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf1, dst_stride1, + top_bsize, BLOCK_8X8, dry_run, 1, 1); + predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf2, dst_stride2, + top_bsize, BLOCK_8X8, dry_run, 1, 1); + predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf3, dst_stride3, + top_bsize, BLOCK_8X8, dry_run, 1, 1); + + if (bsize < top_bsize) { + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); + extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3); + } + } else { + predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top, + mi_col_top, dry_run, subsize, top_bsize, dst_buf, + dst_stride, pc_tree->split[0]); + if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) + predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top, + mi_col_top, dry_run, subsize, top_bsize, dst_buf1, + dst_stride1, pc_tree->split[1]); + if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) + predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top, + mi_col_top, dry_run, subsize, top_bsize, dst_buf2, + dst_stride2, pc_tree->split[2]); + if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) + predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs, + mi_row_top, mi_col_top, dry_run, subsize, + top_bsize, dst_buf3, dst_stride3, + pc_tree->split[3]); + } + for (i = 0; i < MAX_MB_PLANE; i++) { +#if !CONFIG_CB4X4 + if (bsize == BLOCK_8X8 && i != 0) + continue; // Skip <4x4 chroma smoothing +#endif + if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) { + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_VERT, i); + if (mi_row + hbs < cm->mi_rows) { + av1_build_masked_inter_predictor_complex( + xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_VERT, i); + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_HORZ, i); + } + } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) { + if (bsize == BLOCK_8X8 && i != 0) + continue; // Skip <4x4 chroma smoothing + + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_HORZ, i); + } + } + break; +#if CONFIG_EXT_PARTITION_TYPES + case PARTITION_HORZ_A: + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, + dst_stride1, top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, + top_bsize, subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1); + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, + i); + } + for (i = 0; i < MAX_MB_PLANE; i++) { + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, + i); + } + + break; + case PARTITION_VERT_A: + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, + top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, + dst_stride2, top_bsize, subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2); + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, + i); + } + for (i = 0; i < MAX_MB_PLANE; i++) { + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, + i); + } + break; + case PARTITION_HORZ_B: + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); + + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, + mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, + top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, + mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, + dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, + dst_stride2); + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf1[i]; + xd->plane[i].dst.stride = dst_stride1[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_VERT, i); + } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, + i); + } + break; + case PARTITION_VERT_B: + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, + subsize, dry_run, 0, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, + dst_stride1, top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, + mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, + dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); + extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, + dst_stride2); + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf1[i]; + xd->plane[i].dst.stride = dst_stride1[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_HORZ, i); + } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row, + mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, + i); + } + break; +#endif // CONFIG_EXT_PARTITION_TYPES + default: assert(0); + } + +#if CONFIG_EXT_PARTITION_TYPES + if (bsize < top_bsize) + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +#else + if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +#endif // CONFIG_EXT_PARTITION_TYPES +} + +static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, + const TileInfo *const tile, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, + TX_TYPE *best_tx, PC_TREE *pc_tree) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate, + base_rate = *tmp_rate; + int64_t sse, pnsse, sse_uv, this_dist, dist_uv; + uint8_t *dst_buf[3]; + int dst_stride[3]; + TX_SIZE tx_size; + MB_MODE_INFO *mbmi; + TX_TYPE tx_type, best_tx_nostx; +#if CONFIG_EXT_TX + int ext_tx_set; +#endif // CONFIG_EXT_TX + int tmp_rate_tx = 0, skip_tx = 0; + int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX; + + set_skip_context(xd, mi_row, mi_col); + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree); + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col); + for (plane = 0; plane < MAX_MB_PLANE; plane++) { + dst_buf[plane] = xd->plane[plane].dst.buf; + dst_stride[plane] = xd->plane[plane].dst.stride; + } + predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize, + bsize, dst_buf, dst_stride, pc_tree); + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); + + mbmi = &xd->mi[0]->mbmi; + best_tx_nostx = mbmi->tx_type; + + *best_tx = DCT_DCT; + + // chroma + skippable_uv = 1; + rate_uv = 0; + dist_uv = 0; + sse_uv = 0; + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_VAR_TX + ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + tx_size = max_txsize_lookup[bsize]; + tx_size = + uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; + av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl); + + av1_subtract_plane(x, bsize, plane); + av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0, + get_plane_block_size(bsize, pd), &ctxa[0], &ctxl[0], + &this_rd_stats); + + this_rate = this_rd_stats.rate; + this_dist = this_rd_stats.dist; + pnsse = this_rd_stats.sse; + pnskip = this_rd_stats.skip; +#else + tx_size = max_txsize_lookup[bsize]; + tx_size = + uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y]; + av1_subtract_plane(x, bsize, plane); + av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip, + &pnsse, INT64_MAX, plane, bsize, tx_size, 0); +#endif // CONFIG_VAR_TX + + rate_uv += this_rate; + dist_uv += this_dist; + sse_uv += pnsse; + skippable_uv &= pnskip; + } + + // luma + tx_size = max_txsize_lookup[bsize]; + av1_subtract_plane(x, bsize, 0); +#if CONFIG_EXT_TX + ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used); +#endif // CONFIG_EXT_TX + for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { +#if CONFIG_VAR_TX + ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; + const struct macroblockd_plane *const pd = &xd->plane[0]; + RD_STATS this_rd_stats; +#endif // CONFIG_VAR_TX + +#if CONFIG_EXT_TX + if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; +#else + if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue; +#endif // CONFIG_EXT_TX + mbmi->tx_type = tx_type; + +#if CONFIG_VAR_TX + av1_init_rd_stats(&this_rd_stats); + av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl); + av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, &ctxa[0], &ctxl[0], + &this_rd_stats); + + this_rate = this_rd_stats.rate; + this_dist = this_rd_stats.dist; + pnsse = this_rd_stats.sse; + pnskip = this_rd_stats.skip; +#else + av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip, + &pnsse, INT64_MAX, 0, bsize, tx_size, 0); +#endif // CONFIG_VAR_TX + +#if CONFIG_EXT_TX + if (get_ext_tx_types(tx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) { + if (ext_tx_set > 0) + this_rate += + cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type]; + } +#else + if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] && + this_rate != INT_MAX) { + this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type]; + } +#endif // CONFIG_EXT_TX + *tmp_rate = rate_uv + this_rate; + *tmp_dist = dist_uv + this_dist; + sse = sse_uv + pnsse; + skippable = skippable_uv && pnskip; + if (skippable) { + *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + x->skip = 1; + } else { + if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) < + RDCOST(x->rdmult, x->rddiv, 0, sse)) { + *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + x->skip = 0; + } else { + *tmp_dist = sse; + *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + x->skip = 1; + } + } + *tmp_rate += base_rate; + rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist); + if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) { + *best_tx = tx_type; + bestrd_tx = rd_tx; + tmp_rate_tx = *tmp_rate; + tmp_dist_tx = *tmp_dist; + skip_tx = x->skip; + } + } + *tmp_rate = tmp_rate_tx; + *tmp_dist = tmp_dist_tx; + x->skip = skip_tx; +#if CONFIG_VAR_TX + for (plane = 0; plane < 1; ++plane) + memset(x->blk_skip[plane], x->skip, + sizeof(uint8_t) * pc_tree->none.num_4x4_blk); +#endif // CONFIG_VAR_TX + xd->mi[0]->mbmi.tx_type = best_tx_nostx; +} +#endif // CONFIG_SUPERTX diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h new file mode 100644 index 000000000..08d6d20de --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_ENCODEFRAME_H_ +#define AV1_ENCODER_ENCODEFRAME_H_ + +#include "aom/aom_integer.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct macroblock; +struct yv12_buffer_config; +struct AV1_COMP; +struct ThreadData; + +// Constants used in SOURCE_VAR_BASED_PARTITION +#define VAR_HIST_MAX_BG_VAR 1000 +#define VAR_HIST_FACTOR 10 +#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) +#define VAR_HIST_LARGE_CUT_OFF 75 +#define VAR_HIST_SMALL_CUT_OFF 45 + +void av1_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col); + +void av1_encode_frame(struct AV1_COMP *cpi); + +void av1_init_tile_data(struct AV1_COMP *cpi); +void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); + +void av1_set_variance_partition_thresholds(struct AV1_COMP *cpi, int q); + +void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd, +#if CONFIG_TXK_SEL + int block, int plane, +#endif + BLOCK_SIZE bsize, TX_SIZE tx_size, + FRAME_COUNTS *counts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_ENCODEFRAME_H_ diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c new file mode 100644 index 000000000..c450244b1 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemb.c @@ -0,0 +1,1671 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodemb.h" +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.h" +#endif +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/tokenize.h" + +#if CONFIG_PVQ +#include "av1/encoder/encint.h" +#include "av1/common/partition.h" +#include "av1/encoder/pvq_encoder.h" +#endif + +#if CONFIG_CFL +#include "av1/common/cfl.h" +#endif + +// Check if one needs to use c version subtraction. +static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; } + +static void subtract_block(const MACROBLOCKD *xd, int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src8, ptrdiff_t src_stride, + const uint8_t *pred8, ptrdiff_t pred_stride) { +#if !CONFIG_HIGHBITDEPTH + (void)xd; +#endif + + if (check_subtract_block_size(rows, cols)) { +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8, + src_stride, pred8, pred_stride, xd->bd); + return; + } +#endif // CONFIG_HIGHBITDEPTH + aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); + + return; + } + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, + pred8, pred_stride, xd->bd); + return; + } +#endif // CONFIG_HIGHBITDEPTH + aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); +} + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src, + src_stride, dst, dst_stride); +} + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + + subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +} + +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { +#if CONFIG_EC_ADAPT + { 10, 7 }, { 8, 5 }, +#else + { 10, 6 }, { 8, 5 }, +#endif +}; + +#define UPDATE_RD_COST() \ + { \ + rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ + rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ + } + +static INLINE int64_t +get_token_bit_costs(unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS], + int skip_eob, int ctx, int token) { +#if CONFIG_NEW_TOKENSET + (void)skip_eob; + return token_costs[token == ZERO_TOKEN || token == EOB_TOKEN][ctx][token]; +#else + return token_costs[skip_eob][ctx][token]; +#endif +} + +#define USE_GREEDY_OPTIMIZE_B 0 + +#if USE_GREEDY_OPTIMIZE_B + +typedef struct av1_token_state { + int16_t token; + tran_low_t qc; + tran_low_t dqc; +} av1_token_state; + +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, int ctx) { +#if !CONFIG_PVQ + MACROBLOCKD *const xd = &mb->e_mbd; + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ref = is_inter_block(&xd->mi[0]->mbmi); + av1_token_state tokens[MAX_TX_SQUARE + 1][2]; + uint8_t token_cache[MAX_TX_SQUARE]; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE plane_type = pd->plane_type; + const int16_t *const dequant_ptr = pd->dequant; + const uint8_t *const band_translate = get_band_translate(tx_size); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + const int16_t *const scan = scan_order->scan; + const int16_t *const nb = scan_order->neighbors; + int dqv; + const int shift = av1_get_tx_scale(tx_size); +#if CONFIG_AOM_QM + int seg_id = xd->mi[0]->mbmi.segment_id; + const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size]; +#endif +#if CONFIG_NEW_QUANT + int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); + const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; +#elif !CONFIG_AOM_QM + const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; +#endif // CONFIG_NEW_QUANT + int sz = 0; + const int64_t rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int16_t t0, t1; + int i, final_eob; +#if CONFIG_HIGHBITDEPTH + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); +#else + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); +#endif + unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref]; + const int default_eob = tx_size_2d[tx_size]; + + assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); + + assert((!plane_type && !plane) || (plane_type && plane)); + assert(eob <= default_eob); + + int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; +/* CpuSpeedTest uses "--min-q=0 --max-q=0" and expects 100dB psnr +* This creates conflict with search for a better EOB position +* The line below is to make sure EOB search is disabled at this corner case. +*/ +#if !CONFIG_NEW_QUANT && !CONFIG_AOM_QM + if (dq_step[1] <= 4) { + rdmult = 1; + } +#endif + + int64_t rate0, rate1; + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + int x = qcoeff[rc]; + t0 = av1_get_token(x); + + tokens[i][0].qc = x; + tokens[i][0].token = t0; + tokens[i][0].dqc = dqcoeff[rc]; + + token_cache[rc] = av1_pt_energy_class[t0]; + } + tokens[eob][0].token = EOB_TOKEN; + tokens[eob][0].qc = 0; + tokens[eob][0].dqc = 0; + tokens[eob][1] = tokens[eob][0]; + + unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs; + + final_eob = 0; + + int64_t eob_cost0, eob_cost1; + + const int ctx0 = ctx; + /* Record the r-d cost */ + int64_t accu_rate = 0; + int64_t accu_error = 0; + + rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0, + EOB_TOKEN); + int64_t best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + + // int64_t best_block_rd_cost_all0 = best_block_rd_cost; + + int x_prev = 1; + + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + int x = qcoeff[rc]; + sz = -(x < 0); + + int band_cur = band_translate[i]; + int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + int token_tree_sel_cur = (x_prev == 0); + + if (x == 0) { + // no need to search when x == 0 + rate0 = + get_token_bit_costs(*(token_costs_ptr + band_cur), token_tree_sel_cur, + ctx_cur, tokens[i][0].token); + accu_rate += rate0; + x_prev = 0; + // accu_error does not change when x==0 + } else { + /* Computing distortion + */ + // compute the distortion for the first candidate + // and the distortion for quantizing to 0. + int dx0 = (-coeff[rc]) * (1 << shift); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx0 >>= xd->bd - 8; + } +#endif + int64_t d0 = (int64_t)dx0 * dx0; + + int x_a = x - 2 * sz - 1; + int64_t d2, d2_a; + + int dx; + +#if CONFIG_AOM_QM + int iwt = iqmatrix[rc]; + dqv = dequant_ptr[rc != 0]; + dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; +#else + dqv = dequant_ptr[rc != 0]; +#endif + + dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx >>= xd->bd - 8; + } +#endif // CONFIG_HIGHBITDEPTH + d2 = (int64_t)dx * dx; + + /* compute the distortion for the second candidate + * x_a = x - 2 * sz + 1; + */ + if (x_a != 0) { +#if CONFIG_NEW_QUANT + dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) - + (coeff[rc] << shift); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx >>= xd->bd - 8; + } +#endif // CONFIG_HIGHBITDEPTH +#else // CONFIG_NEW_QUANT +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz; + } else { + dx -= (dqv + sz) ^ sz; + } +#else + dx -= (dqv + sz) ^ sz; +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_NEW_QUANT + d2_a = (int64_t)dx * dx; + } else { + d2_a = d0; + } + /* Computing rates and r-d cost + */ + + int best_x, best_eob_x; + int64_t base_bits, next_bits0, next_bits1; + int64_t next_eob_bits0, next_eob_bits1; + + // rate cost of x + base_bits = av1_get_token_cost(x, &t0, cat6_bits); + rate0 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur), + token_tree_sel_cur, ctx_cur, t0); + + base_bits = av1_get_token_cost(x_a, &t1, cat6_bits); + rate1 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur), + token_tree_sel_cur, ctx_cur, t1); + + next_bits0 = 0; + next_bits1 = 0; + next_eob_bits0 = 0; + next_eob_bits1 = 0; + + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + int band_next = band_translate[i + 1]; + + token_cache[rc] = av1_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + + next_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, + tokens[i + 1][0].token); + next_eob_bits0 = + get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, EOB_TOKEN); + + token_cache[rc] = av1_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x_a == 0); + + next_bits1 = get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, + tokens[i + 1][0].token); + + if (x_a != 0) { + next_eob_bits1 = + get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, EOB_TOKEN); + } + } + + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), d2); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), d2_a); + + best_x = (rd_cost1 < rd_cost0); + + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + d2 - d0)); + eob_cost1 = eob_cost0; + if (x_a != 0) { + eob_cost1 = RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + d2_a - d0)); + best_eob_x = (eob_cost1 < eob_cost0); + } else { + best_eob_x = 0; + } + + int dqc, dqc_a = 0; + + dqc = dqcoeff[rc]; + if (best_x + best_eob_x) { + if (x_a != 0) { +#if CONFIG_NEW_QUANT + dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv, + dequant_val[band_translate[i]]); + dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a; + if (sz) dqc_a = -dqc_a; +#else +// The 32x32 transform coefficient uses half quantization step size. +// Account for the rounding difference in the dequantized coefficeint +// value when the quantization index is dropped from an even number +// to an odd number. + +#if CONFIG_AOM_QM + tran_low_t offset = dqv >> shift; +#else + tran_low_t offset = dq_step[rc != 0]; +#endif + if (shift & x_a) offset += (dqv & 0x01); + + if (sz == 0) + dqc_a = dqcoeff[rc] - offset; + else + dqc_a = dqcoeff[rc] + offset; +#endif // CONFIG_NEW_QUANT + } else { + dqc_a = 0; + } // if (x_a != 0) + } + + // record the better quantized value + if (best_x) { + qcoeff[rc] = x_a; + dqcoeff[rc] = dqc_a; + + accu_rate += rate1; + accu_error += d2_a - d0; + assert(d2_a <= d0); + + token_cache[rc] = av1_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += d2 - d0; + assert(d2 <= d0); + + token_cache[rc] = av1_pt_energy_class[t0]; + } + + x_prev = qcoeff[rc]; + + // determine whether to move the eob position to i+1 + int64_t best_eob_cost_i = eob_cost0; + + tokens[i][1].token = t0; + tokens[i][1].qc = x; + tokens[i][1].dqc = dqc; + + if ((x_a != 0) && (best_eob_x)) { + best_eob_cost_i = eob_cost1; + + tokens[i][1].token = t1; + tokens[i][1].qc = x_a; + tokens[i][1].dqc = dqc_a; + } + + if (best_eob_cost_i < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_i; + final_eob = i + 1; + } + } // if (x==0) + } // for (i) + + assert(final_eob <= eob); + if (final_eob > 0) { + assert(tokens[final_eob - 1][1].qc != 0); + i = final_eob - 1; + int rc = scan[i]; + qcoeff[rc] = tokens[i][1].qc; + dqcoeff[rc] = tokens[i][1].dqc; + } + + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } + + mb->plane[plane].eobs[block] = final_eob; + return final_eob; + +#else // !CONFIG_PVQ + (void)cm; + (void)tx_size; + (void)ctx; + struct macroblock_plane *const p = &mb->plane[plane]; + return p->eobs[block]; +#endif // !CONFIG_PVQ +} + +#else // USE_GREEDY_OPTIMIZE_B + +typedef struct av1_token_state { + int64_t error; + int rate; + int16_t next; + int16_t token; + tran_low_t qc; + tran_low_t dqc; + uint8_t best_index; +} av1_token_state; + +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, int ctx) { +#if !CONFIG_PVQ + MACROBLOCKD *const xd = &mb->e_mbd; + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ref = is_inter_block(&xd->mi[0]->mbmi); + av1_token_state tokens[MAX_TX_SQUARE + 1][2]; + uint8_t token_cache[MAX_TX_SQUARE]; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE plane_type = pd->plane_type; + const int default_eob = tx_size_2d[tx_size]; + const int16_t *const dequant_ptr = pd->dequant; + const uint8_t *const band_translate = get_band_translate(tx_size); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + const int16_t *const scan = scan_order->scan; + const int16_t *const nb = scan_order->neighbors; + int dqv; + const int shift = av1_get_tx_scale(tx_size); +#if CONFIG_AOM_QM + int seg_id = xd->mi[0]->mbmi.segment_id; + const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size]; +#endif +#if CONFIG_NEW_QUANT + int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); + const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; +#elif !CONFIG_AOM_QM + const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; +#endif // CONFIG_NEW_QUANT + int next = eob, sz = 0; + const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; + const int64_t rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int rate0, rate1; + int64_t error0, error1; + int16_t t0, t1; + int best, band = (eob < default_eob) ? band_translate[eob] + : band_translate[eob - 1]; + int pt, i, final_eob; +#if CONFIG_HIGHBITDEPTH + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); +#else + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); +#endif + unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref]; + const uint16_t *band_counts = &band_count_table[tx_size][band]; + uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; + int shortcut = 0; + int next_shortcut = 0; + +#if CONFIG_EXT_DELTA_Q + const int qindex = cm->seg.enabled + ? av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id, + cm->base_qindex) + : cm->base_qindex; + if (qindex == 0) { + assert((qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); + } +#else + assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); +#endif + + token_costs += band; + + assert((!plane_type && !plane) || (plane_type && plane)); + assert(eob <= default_eob); + + /* Now set up a Viterbi trellis to evaluate alternative roundings. */ + /* Initialize the sentinel node of the trellis. */ + tokens[eob][0].rate = 0; + tokens[eob][0].error = 0; + tokens[eob][0].next = default_eob; + tokens[eob][0].token = EOB_TOKEN; + tokens[eob][0].qc = 0; + tokens[eob][1] = tokens[eob][0]; + + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_bits); + tokens[i][0].token = t0; + token_cache[rc] = av1_pt_energy_class[t0]; + } + + for (i = eob; i-- > 0;) { + int base_bits, dx; + int64_t d2; + const int rc = scan[i]; + int x = qcoeff[rc]; +#if CONFIG_AOM_QM + int iwt = iqmatrix[rc]; + dqv = dequant_ptr[rc != 0]; + dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; +#else + dqv = dequant_ptr[rc != 0]; +#endif + next_shortcut = shortcut; + + /* Only add a trellis state for non-zero coefficients. */ + if (UNLIKELY(x)) { + error0 = tokens[next][0].error; + error1 = tokens[next][1].error; + /* Evaluate the first possibility for this state. */ + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + + if (next_shortcut) { + /* Consider both possible successor states. */ + if (next < default_eob) { + pt = get_coef_context(nb, token_cache, i + 1); + rate0 += + get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token); + rate1 += + get_token_bit_costs(*token_costs, 0, pt, tokens[next][1].token); + } + UPDATE_RD_COST(); + /* And pick the best. */ + best = rd_cost1 < rd_cost0; + } else { + if (next < default_eob) { + pt = get_coef_context(nb, token_cache, i + 1); + rate0 += + get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token); + } + best = 0; + } + + dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx >>= xd->bd - 8; + } +#endif // CONFIG_HIGHBITDEPTH + d2 = (int64_t)dx * dx; + tokens[i][0].rate += (best ? rate1 : rate0); + tokens[i][0].error = d2 + (best ? error1 : error0); + tokens[i][0].next = next; + tokens[i][0].qc = x; + tokens[i][0].dqc = dqcoeff[rc]; + tokens[i][0].best_index = best; + + /* Evaluate the second possibility for this state. */ + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + + // The threshold of 3 is empirically obtained. + if (UNLIKELY(abs(x) > 3)) { + shortcut = 0; + } else { +#if CONFIG_NEW_QUANT + shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dqv, + dequant_val[band_translate[i]]) > + (abs(coeff[rc]) << shift)) && + (av1_dequant_abscoeff_nuq(abs(x) - 1, dqv, + dequant_val[band_translate[i]]) < + (abs(coeff[rc]) << shift))); +#else // CONFIG_NEW_QUANT +#if CONFIG_AOM_QM + if ((abs(x) * dequant_ptr[rc != 0] * iwt > + ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) && + (abs(x) * dequant_ptr[rc != 0] * iwt < + (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]) + << AOM_QM_BITS))) +#else + if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && + (abs(x) * dequant_ptr[rc != 0] < + (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) +#endif // CONFIG_AOM_QM + shortcut = 1; + else + shortcut = 0; +#endif // CONFIG_NEW_QUANT + } + + if (shortcut) { + sz = -(x < 0); + x -= 2 * sz + 1; + } else { + tokens[i][1] = tokens[i][0]; + next = i; + + if (UNLIKELY(!(--band_left))) { + --band_counts; + band_left = *band_counts; + --token_costs; + } + continue; + } + + /* Consider both possible successor states. */ + if (!x) { + /* If we reduced this coefficient to zero, check to see if + * we need to move the EOB back here. + */ + t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; + t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; + base_bits = 0; + } else { + base_bits = av1_get_token_cost(x, &t0, cat6_bits); + t1 = t0; + } + + if (next_shortcut) { + if (LIKELY(next < default_eob)) { + if (t0 != EOB_TOKEN) { + token_cache[rc] = av1_pt_energy_class[t0]; + pt = get_coef_context(nb, token_cache, i + 1); + rate0 += get_token_bit_costs(*token_costs, !x, pt, + tokens[next][0].token); + } + if (t1 != EOB_TOKEN) { + token_cache[rc] = av1_pt_energy_class[t1]; + pt = get_coef_context(nb, token_cache, i + 1); + rate1 += get_token_bit_costs(*token_costs, !x, pt, + tokens[next][1].token); + } + } + + UPDATE_RD_COST(); + /* And pick the best. */ + best = rd_cost1 < rd_cost0; + } else { + // The two states in next stage are identical. + if (next < default_eob && t0 != EOB_TOKEN) { + token_cache[rc] = av1_pt_energy_class[t0]; + pt = get_coef_context(nb, token_cache, i + 1); + rate0 += + get_token_bit_costs(*token_costs, !x, pt, tokens[next][0].token); + } + best = 0; + } + +#if CONFIG_NEW_QUANT + dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) - + (coeff[rc] << shift); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx >>= xd->bd - 8; + } +#endif // CONFIG_HIGHBITDEPTH +#else // CONFIG_NEW_QUANT +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz; + } else { + dx -= (dqv + sz) ^ sz; + } +#else + dx -= (dqv + sz) ^ sz; +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_NEW_QUANT + d2 = (int64_t)dx * dx; + + tokens[i][1].rate = base_bits + (best ? rate1 : rate0); + tokens[i][1].error = d2 + (best ? error1 : error0); + tokens[i][1].next = next; + tokens[i][1].token = best ? t1 : t0; + tokens[i][1].qc = x; + + if (x) { +#if CONFIG_NEW_QUANT + tokens[i][1].dqc = av1_dequant_abscoeff_nuq( + abs(x), dqv, dequant_val[band_translate[i]]); + tokens[i][1].dqc = shift ? ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift) + : tokens[i][1].dqc; + if (sz) tokens[i][1].dqc = -tokens[i][1].dqc; +#else +// The 32x32 transform coefficient uses half quantization step size. +// Account for the rounding difference in the dequantized coefficeint +// value when the quantization index is dropped from an even number +// to an odd number. + +#if CONFIG_AOM_QM + tran_low_t offset = dqv >> shift; +#else + tran_low_t offset = dq_step[rc != 0]; +#endif + if (shift & x) offset += (dqv & 0x01); + + if (sz == 0) + tokens[i][1].dqc = dqcoeff[rc] - offset; + else + tokens[i][1].dqc = dqcoeff[rc] + offset; +#endif // CONFIG_NEW_QUANT + } else { + tokens[i][1].dqc = 0; + } + + tokens[i][1].best_index = best; + /* Finally, make this the new head of the trellis. */ + next = i; + } else { + /* There's no choice to make for a zero coefficient, so we don't + * add a new trellis node, but we do need to update the costs. + */ + t0 = tokens[next][0].token; + t1 = tokens[next][1].token; + pt = get_coef_context(nb, token_cache, i + 1); + /* Update the cost of each path if we're past the EOB token. */ + if (t0 != EOB_TOKEN) { + tokens[next][0].rate += get_token_bit_costs(*token_costs, 1, pt, t0); + tokens[next][0].token = ZERO_TOKEN; + } + if (t1 != EOB_TOKEN) { + tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1); + tokens[next][1].token = ZERO_TOKEN; + } + tokens[i][0].best_index = tokens[i][1].best_index = 0; + shortcut = (tokens[next][0].rate != tokens[next][1].rate); + /* Don't update next, because we didn't add a new node. */ + } + + if (UNLIKELY(!(--band_left))) { + --band_counts; + band_left = *band_counts; + --token_costs; + } + } + + /* Now pick the best path through the whole trellis. */ + rate0 = tokens[next][0].rate; + rate1 = tokens[next][1].rate; + error0 = tokens[next][0].error; + error1 = tokens[next][1].error; + t0 = tokens[next][0].token; + t1 = tokens[next][1].token; + rate0 += get_token_bit_costs(*token_costs, 0, ctx, t0); + rate1 += get_token_bit_costs(*token_costs, 0, ctx, t1); + UPDATE_RD_COST(); + best = rd_cost1 < rd_cost0; + + final_eob = -1; + + for (i = next; i < eob; i = next) { + const int x = tokens[i][best].qc; + const int rc = scan[i]; + if (x) final_eob = i; + qcoeff[rc] = x; + dqcoeff[rc] = tokens[i][best].dqc; + + next = tokens[i][best].next; + best = tokens[i][best].best_index; + } + final_eob++; + + mb->plane[plane].eobs[block] = final_eob; + assert(final_eob <= default_eob); + return final_eob; +#else // !CONFIG_PVQ + (void)cm; + (void)tx_size; + (void)ctx; + struct macroblock_plane *const p = &mb->plane[plane]; + return p->eobs[block]; +#endif // !CONFIG_PVQ +} + +#endif // USE_GREEDY_OPTIMIZE_B + +#if !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH +typedef enum QUANT_FUNC { + QUANT_FUNC_LOWBD = 0, + QUANT_FUNC_HIGHBD = 1, + QUANT_FUNC_TYPES = 2 +} QUANT_FUNC; + +static AV1_QUANT_FACADE + quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { +#if !CONFIG_NEW_QUANT + { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, + { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, + { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, +#else // !CONFIG_NEW_QUANT + { av1_quantize_fp_nuq_facade, av1_highbd_quantize_fp_nuq_facade }, + { av1_quantize_b_nuq_facade, av1_highbd_quantize_b_nuq_facade }, + { av1_quantize_dc_nuq_facade, av1_highbd_quantize_dc_nuq_facade }, +#endif // !CONFIG_NEW_QUANT + { NULL, NULL } + }; + +#else + +typedef enum QUANT_FUNC { + QUANT_FUNC_LOWBD = 0, + QUANT_FUNC_TYPES = 1 +} QUANT_FUNC; + +static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] + [QUANT_FUNC_TYPES] = { +#if !CONFIG_NEW_QUANT + { av1_quantize_fp_facade }, + { av1_quantize_b_facade }, + { av1_quantize_dc_facade }, +#else // !CONFIG_NEW_QUANT + { av1_quantize_fp_nuq_facade }, + { av1_quantize_b_nuq_facade }, + { av1_quantize_dc_nuq_facade }, +#endif // !CONFIG_NEW_QUANT + { NULL } + }; +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_PVQ + +void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int ctx, + AV1_XFORM_QUANT xform_quant_idx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +#if !(CONFIG_PVQ || CONFIG_DAALA_DIST) + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; +#else + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; +#endif + PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = block_size_wide[plane_bsize]; +#if CONFIG_AOM_QM + int seg_id = mbmi->segment_id; + const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][!is_inter][tx_size]; + const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size]; +#endif + + FWD_TXFM_PARAM fwd_txfm_param; + +#if CONFIG_PVQ || CONFIG_DAALA_DIST + uint8_t *dst; + int16_t *pred; + const int dst_stride = pd->dst.stride; + int tx_blk_size; + int i, j; +#endif + +#if !CONFIG_PVQ + const int tx2d_size = tx_size_2d[tx_size]; + QUANT_PARAM qparam; + const int16_t *src_diff; + + src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + qparam.log_scale = av1_get_tx_scale(tx_size); +#if CONFIG_NEW_QUANT + qparam.tx_size = tx_size; + qparam.dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type); +#endif // CONFIG_NEW_QUANT +#if CONFIG_AOM_QM + qparam.qmatrix = qmatrix; + qparam.iqmatrix = iqmatrix; +#endif // CONFIG_AOM_QM +#else + tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); + int skip = 1; + PVQ_INFO *pvq_info = NULL; + uint8_t *src; + int16_t *src_int16; + const int src_stride = p->src.stride; + + (void)ctx; + (void)scan_order; + (void)qcoeff; + + if (x->pvq_coded) { + assert(block < MAX_PVQ_BLOCKS_IN_SB); + pvq_info = &x->pvq[block][plane]; + } + src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + src_int16 = + &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + + // transform block size in pixels + tx_blk_size = tx_size_wide[tx_size]; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + src_int16[diff_stride * j + i] = + CONVERT_TO_SHORTPTR(src)[src_stride * j + i]; + } else { +#endif // CONFIG_HIGHBITDEPTH + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + src_int16[diff_stride * j + i] = src[src_stride * j + i]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#endif + +#if CONFIG_PVQ || CONFIG_DAALA_DIST + dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + + // transform block size in pixels + tx_blk_size = tx_size_wide[tx_size]; + +// copy uint8 orig and predicted block to int16 buffer +// in order to use existing VP10 transform functions +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + pred[diff_stride * j + i] = + CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i]; + } else { +#endif // CONFIG_HIGHBITDEPTH + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + pred[diff_stride * j + i] = dst[dst_stride * j + i]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#endif + + (void)ctx; + + fwd_txfm_param.tx_type = tx_type; + fwd_txfm_param.tx_size = tx_size; + fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id]; + +#if !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH + fwd_txfm_param.bd = xd->bd; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); + if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { + if (LIKELY(!x->skip_block)) { + quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD]( + coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam); + } else { + av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob); + } + } +#if CONFIG_LV_MAP + p->txb_entropy_ctx[block] = + (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); +#endif // CONFIG_LV_MAP + return; + } +#endif // CONFIG_HIGHBITDEPTH + av1_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); + if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { + if (LIKELY(!x->skip_block)) { + quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD]( + coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam); + } else { + av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob); + } + } +#if CONFIG_LV_MAP + p->txb_entropy_ctx[block] = + (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); +#endif // CONFIG_LV_MAP +#else // #if !CONFIG_PVQ + (void)xform_quant_idx; +#if CONFIG_HIGHBITDEPTH + fwd_txfm_param.bd = xd->bd; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param); + av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param); + } else { +#endif + av1_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param); + av1_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param); +#if CONFIG_HIGHBITDEPTH + } +#endif + + // PVQ for inter mode block + if (!x->skip_block) { + PVQ_SKIP_TYPE ac_dc_coded = + av1_pvq_encode_helper(x, + coeff, // target original vector + ref_coeff, // reference vector + dqcoeff, // de-quantized vector + eob, // End of Block marker + pd->dequant, // aom's quantizers + plane, // image plane + tx_size, // block size in log_2 - 2 + tx_type, + &x->rate, // rate measured + x->pvq_speed, + pvq_info); // PVQ info for a block + skip = ac_dc_coded == PVQ_SKIP; + } + x->pvq_skip[plane] = skip; + + if (!skip) mbmi->skip = 0; +#endif // #if !CONFIG_PVQ +} + +static void encode_block(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct encode_b_args *const args = arg; + AV1_COMMON *cm = args->cm; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + int ctx; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *dst; +#if !CONFIG_PVQ + ENTROPY_CONTEXT *a, *l; +#endif +#if CONFIG_VAR_TX + int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; +#endif + dst = &pd->dst + .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; + +#if !CONFIG_PVQ + a = &args->ta[blk_col]; + l = &args->tl[blk_row]; +#if CONFIG_VAR_TX + ctx = get_entropy_context(tx_size, a, l); +#else + ctx = combine_entropy_contexts(*a, *l); +#endif +#else + ctx = 0; +#endif // CONFIG_PVQ + +#if CONFIG_VAR_TX + // Assert not magic number (uninitialized). + assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234); + + if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) { +#else + { +#endif + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + ctx, AV1_XFORM_QUANT_FP); + } +#if CONFIG_VAR_TX + else { + p->eobs[block] = 0; + } +#endif + +#if !CONFIG_PVQ + if (p->eobs[block] && !xd->lossless[xd->mi[0]->mbmi.segment_id]) + av1_optimize_b(cm, x, plane, block, tx_size, ctx); + + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (p->eobs[block]) *(args->skip) = 0; + + if (p->eobs[block] == 0) return; +#else + (void)ctx; + if (!x->pvq_skip[plane]) *(args->skip) = 0; + + if (x->pvq_skip[plane]) return; +#endif + TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size); + av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block]); +} + +#if CONFIG_VAR_TX +static void encode_block_inter(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + TX_SIZE plane_tx_size; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + plane_tx_size = + plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] + : mbmi->inter_tx_size[tx_row][tx_col]; + + if (tx_size == plane_tx_size) { + encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + // This is the square transform block partition entry point. + int bsl = tx_size_wide_unit[sub_txs]; + int i; + assert(bsl > 0); + assert(tx_size < TX_SIZES_ALL); + + for (i = 0; i < 4; ++i) { + const int offsetr = blk_row + ((i >> 1) * bsl); + const int offsetc = blk_col + ((i & 0x01) * bsl); + int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, + arg); + block += step; + } + } +} +#endif + +typedef struct encode_block_pass1_args { + AV1_COMMON *cm; + MACROBLOCK *x; +} encode_block_pass1_args; + +static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + encode_block_pass1_args *args = (encode_block_pass1_args *)arg; + AV1_COMMON *cm = args->cm; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *dst; + int ctx = 0; + dst = &pd->dst + .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; + + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + ctx, AV1_XFORM_QUANT_B); +#if !CONFIG_PVQ + if (p->eobs[block] > 0) { +#else + if (!x->pvq_skip[plane]) { + { + int tx_blk_size; + int i, j; + // transform block size in pixels + tx_blk_size = tx_size_wide[tx_size]; + +// Since av1 does not have separate function which does inverse transform +// but av1_inv_txfm_add_*x*() also does addition of predicted image to +// inverse transformed image, +// pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0; + } else { +#endif // CONFIG_HIGHBITDEPTH + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + } +#endif // !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { + av1_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + xd->bd); + } else { + av1_highbd_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + xd->bd); + } + return; + } +#endif // CONFIG_HIGHBITDEPTH + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { + av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + } else { + av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + } + } +} + +void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { + encode_block_pass1_args args = { cm, x }; + av1_subtract_plane(x, bsize, 0); + av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, &args); +} + +void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, + const int mi_row, const int mi_col) { + MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx ctx; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; + int plane; + + mbmi->skip = 1; + + if (x->skip) return; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + const int subsampling_x = xd->plane[plane].subsampling_x; + const int subsampling_y = xd->plane[plane].subsampling_y; + + if (!is_chroma_reference(mi_row, mi_col, bsize, subsampling_x, + subsampling_y)) + continue; + + bsize = scale_chroma_bsize(bsize, subsampling_x, subsampling_y); +#else + (void)mi_row; + (void)mi_col; +#endif + +#if CONFIG_VAR_TX + // TODO(jingning): Clean this up. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; + const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + int idx, idy; + int block = 0; + int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]); +#else + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = get_tx_size(plane, xd); + av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); +#endif + +#if !CONFIG_PVQ + av1_subtract_plane(x, bsize, plane); +#endif + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + +#if CONFIG_VAR_TX + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + encode_block_inter(plane, block, idy, idx, plane_bsize, max_tx_size, + &arg); + block += step; + } + } +#else + av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); +#endif + } +} + +#if CONFIG_SUPERTX +void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx ctx; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 }; + int plane; + + mbmi->skip = 1; + if (x->skip) return; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; +#if CONFIG_VAR_TX + const TX_SIZE tx_size = TX_4X4; +#else + const TX_SIZE tx_size = get_tx_size(plane, xd); +#endif + av1_subtract_plane(x, bsize, plane); + av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); + } +} +#endif // CONFIG_SUPERTX + +#if !CONFIG_PVQ +void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, + ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { + (void)tx_size; + struct macroblock_plane *p = &x->plane[plane]; + +#if !CONFIG_LV_MAP + *a = *l = p->eobs[block] > 0; +#else // !CONFIG_LV_MAP + *a = *l = p->txb_entropy_ctx[block]; +#endif // !CONFIG_LV_MAP + +#if CONFIG_VAR_TX || CONFIG_LV_MAP + int i; + for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0]; + + for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0]; +#endif +} +#endif + +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + arg); +#if !CONFIG_PVQ + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +#endif +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + AV1_COMMON *cm = args->cm; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + uint16_t *eob = &p->eobs[block]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = &args->tl[blk_row]; + int ctx = combine_entropy_contexts(*a, *l); + if (args->enable_optimize_b) { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + ctx, AV1_XFORM_QUANT_FP); + if (p->eobs[block]) { + av1_optimize_b(cm, x, plane, block, tx_size, ctx); + } + } else { + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + ctx, AV1_XFORM_QUANT_B); + } + +#if CONFIG_PVQ + // *(args->skip) == mbmi->skip + if (!x->pvq_skip[plane]) *(args->skip) = 0; + + if (x->pvq_skip[plane]) return; +#endif // CONFIG_PVQ + av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, dst_stride, + *eob); +#if !CONFIG_PVQ + if (*eob) *(args->skip) = 0; +#else +// Note : *(args->skip) == mbmi->skip +#endif +#if CONFIG_CFL + if (plane == AOM_PLANE_Y && x->cfl_store_y) { + cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); + } +#endif +} + +void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, + int enable_optimize_b, const int mi_row, + const int mi_col) { + const MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 }; + + struct encode_b_args arg = { + cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b + }; + +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) + return; +#else + (void)mi_row; + (void)mi_col; +#endif + + if (enable_optimize_b) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = get_tx_size(plane, xd); + av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl); + } + av1_foreach_transformed_block_in_plane( + xd, bsize, plane, encode_block_intra_and_set_context, &arg); +} + +#if CONFIG_PVQ +PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, + tran_low_t *ref_coeff, + tran_low_t *const dqcoeff, uint16_t *eob, + const int16_t *quant, int plane, + int tx_size, TX_TYPE tx_type, int *rate, + int speed, PVQ_INFO *pvq_info) { + const int tx_blk_size = tx_size_wide[tx_size]; + daala_enc_ctx *daala_enc = &x->daala_enc; + PVQ_SKIP_TYPE ac_dc_coded; + int coeff_shift = 3 - av1_get_tx_scale(tx_size); + int hbd_downshift = 0; + int rounding_mask; + int pvq_dc_quant; + int use_activity_masking = daala_enc->use_activity_masking; + int tell; + int has_dc_skip = 1; + int i; + int off = od_qm_offset(tx_size, plane ? 1 : 0); + + DECLARE_ALIGNED(16, tran_low_t, coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + DECLARE_ALIGNED(16, tran_low_t, ref_coeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff_pvq[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + + DECLARE_ALIGNED(16, int32_t, in_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); + +#if CONFIG_HIGHBITDEPTH + hbd_downshift = x->e_mbd.bd - 8; +#endif + + assert(OD_COEFF_SHIFT >= 4); + // DC quantizer for PVQ + if (use_activity_masking) + pvq_dc_quant = + OD_MAXI(1, (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) * + daala_enc->state + .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >> + 4); + else + pvq_dc_quant = + OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift); + + *eob = 0; + +#if CONFIG_DAALA_EC + tell = od_ec_enc_tell_frac(&daala_enc->w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + + // Change coefficient ordering for pvq encoding. + od_raster_to_coding_order(coeff_pvq, tx_blk_size, tx_type, coeff, + tx_blk_size); + od_raster_to_coding_order(ref_coeff_pvq, tx_blk_size, tx_type, ref_coeff, + tx_blk_size); + + // copy int16 inputs to int32 + for (i = 0; i < tx_blk_size * tx_blk_size; i++) { + ref_int32[i] = + AOM_SIGNED_SHL(ref_coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> + hbd_downshift; + in_int32[i] = AOM_SIGNED_SHL(coeff_pvq[i], OD_COEFF_SHIFT - coeff_shift) >> + hbd_downshift; + } + + if (abs(in_int32[0] - ref_int32[0]) < pvq_dc_quant * 141 / 256) { /* 0.55 */ + out_int32[0] = 0; + } else { + out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant); + } + + ac_dc_coded = + od_pvq_encode(daala_enc, ref_int32, in_int32, out_int32, + OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> + hbd_downshift), // scale/quantizer + OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >> + hbd_downshift), // scale/quantizer + plane, + tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size], + 0, // is_keyframe, + daala_enc->state.qm + off, daala_enc->state.qm_inv + off, + speed, // speed + pvq_info); + + // Encode residue of DC coeff, if required. + if (!has_dc_skip || out_int32[0]) { + generic_encode(&daala_enc->w, &daala_enc->state.adapt->model_dc[plane], + abs(out_int32[0]) - has_dc_skip, + &daala_enc->state.adapt->ex_dc[plane][tx_size][0], 2); + } + if (out_int32[0]) { + aom_write_bit(&daala_enc->w, out_int32[0] < 0); + } + + // need to save quantized residue of DC coeff + // so that final pvq bitstream writing can know whether DC is coded. + if (pvq_info) pvq_info->dq_dc_residue = out_int32[0]; + + out_int32[0] = out_int32[0] * pvq_dc_quant; + out_int32[0] += ref_int32[0]; + + // copy int32 result back to int16 + assert(OD_COEFF_SHIFT > coeff_shift); + rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1; + for (i = 0; i < tx_blk_size * tx_blk_size; i++) { + out_int32[i] = AOM_SIGNED_SHL(out_int32[i], hbd_downshift); + dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >> + (OD_COEFF_SHIFT - coeff_shift); + } + + // Back to original coefficient order + od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq, + tx_blk_size); + + *eob = tx_blk_size * tx_blk_size; + +#if CONFIG_DAALA_EC + *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell) + << (AV1_PROB_COST_SHIFT - OD_BITRES); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + assert(*rate >= 0); + + return ac_dc_coded; +} + +void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, + od_coeff *y, int nb_bands, const int *off, + int *size, int skip_rest, int skip_dir, + int bs) { // block size in log_2 -2 + int i; + const int tx_blk_size = tx_size_wide[bs]; + + for (i = 0; i < nb_bands; i++) { + pvq_info->qg[i] = qg[i]; + pvq_info->theta[i] = theta[i]; + pvq_info->k[i] = k[i]; + pvq_info->off[i] = off[i]; + pvq_info->size[i] = size[i]; + } + + memcpy(pvq_info->y, y, tx_blk_size * tx_blk_size * sizeof(od_coeff)); + + pvq_info->nb_bands = nb_bands; + pvq_info->skip_rest = skip_rest; + pvq_info->skip_dir = skip_dir; + pvq_info->bs = bs; +} +#endif diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h new file mode 100644 index 000000000..73fde1d88 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemb.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_ENCODEMB_H_ +#define AV1_ENCODER_ENCODEMB_H_ + +#include "./aom_config.h" +#include "av1/common/onyxc_int.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE]; +}; + +struct encode_b_args { + AV1_COMMON *cm; + MACROBLOCK *x; + struct optimize_ctx *ctx; + int8_t *skip; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + int8_t enable_optimize_b; +}; + +typedef enum AV1_XFORM_QUANT { + AV1_XFORM_QUANT_FP = 0, + AV1_XFORM_QUANT_B = 1, + AV1_XFORM_QUANT_DC = 2, + AV1_XFORM_QUANT_SKIP_QUANT, + AV1_XFORM_QUANT_TYPES, +} AV1_XFORM_QUANT; + +void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col); +#if CONFIG_SUPERTX +void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); +#endif // CONFIG_SUPERTX +void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize); +void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx); + +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, int ctx); + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size); + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); + +void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, + ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l); + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, + int enable_optimize_b, int mi_row, + int mi_col); + +#if CONFIG_PVQ +PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, + tran_low_t *ref_coeff, + tran_low_t *const dqcoeff, uint16_t *eob, + const int16_t *quant, int plane, + int tx_size, TX_TYPE tx_type, int *rate, + int speed, PVQ_INFO *pvq_info); + +void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, + od_coeff *y, int nb_bands, const int *off, + int *size, int skip_rest, int skip_dir, int bs); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_ENCODEMB_H_ diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c new file mode 100644 index 000000000..a2a53f840 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.c @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/common.h" +#include "av1/common/entropymode.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/subexp.h" + +#include "aom_dsp/aom_dsp_common.h" + +static struct av1_token mv_joint_encodings[MV_JOINTS]; +static struct av1_token mv_class_encodings[MV_CLASSES]; +static struct av1_token mv_fp_encodings[MV_FP_SIZE]; + +void av1_entropy_mv_init(void) { + av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree); + av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree); + av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree); +} + +static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, + int usehp) { + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + assert(comp != 0); + + // Sign + aom_write(w, sign, mvcomp->sign); + +// Class +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES); +#else + av1_write_token(w, av1_mv_class_tree, mvcomp->classes, + &mv_class_encodings[mv_class]); +#endif + + // Integer bits + if (mv_class == MV_CLASS_0) { + aom_write(w, d, mvcomp->class0[0]); + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]); + } + +// Fractional bits +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol( + w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE); +#else + av1_write_token(w, av1_mv_fp_tree, + mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp, + &mv_fp_encodings[fr]); +#endif + + // High precision bit + if (usehp) + aom_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp); +} + +static void build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + int usehp) { + int i, v; + int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; + int bits_cost[MV_OFFSET_BITS][2]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; + int class0_hp_cost[2], hp_cost[2]; + + sign_cost[0] = av1_cost_zero(mvcomp->sign); + sign_cost[1] = av1_cost_one(mvcomp->sign); + av1_cost_tokens(class_cost, mvcomp->classes, av1_mv_class_tree); + av1_cost_tokens(class0_cost, mvcomp->class0, av1_mv_class0_tree); + for (i = 0; i < MV_OFFSET_BITS; ++i) { + bits_cost[i][0] = av1_cost_zero(mvcomp->bits[i]); + bits_cost[i][1] = av1_cost_one(mvcomp->bits[i]); + } + + for (i = 0; i < CLASS0_SIZE; ++i) + av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree); + av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree); + + if (usehp) { + class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp); + class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp); + hp_cost[0] = av1_cost_zero(mvcomp->hp); + hp_cost[1] = av1_cost_one(mvcomp->hp); + } + mvcost[0] = 0; + for (v = 1; v <= MV_MAX; ++v) { + int z, c, o, d, e, f, cost = 0; + z = v - 1; + c = av1_get_mv_class(z, &o); + cost += class_cost[c]; + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { + cost += class0_cost[d]; + } else { + const int b = c + CLASS0_BITS - 1; /* number of bits */ + for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; + } + if (c == MV_CLASS_0) { + cost += class0_fp_cost[d][f]; + } else { + cost += fp_cost[f]; + } + if (usehp) { + if (c == MV_CLASS_0) { + cost += class0_hp_cost[e]; + } else { + cost += hp_cost[e]; + } + } + mvcost[v] = cost + sign_cost[0]; + mvcost[-v] = cost + sign_cost[1]; + } +} + +static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p, + aom_prob upd_p) { + (void)upd_p; +#if CONFIG_TILE_GROUPS + // Just use the default maximum number of tile groups to avoid passing in the + // actual + // number + av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG); +#else + av1_cond_prob_diff_update(w, cur_p, ct, 1); +#endif +} + +#if !CONFIG_EC_ADAPT +static void write_mv_update(const aom_tree_index *tree, + aom_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], int n, + aom_writer *w) { + int i; + unsigned int branch_ct[32][2]; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + + av1_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) + update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB); +} +#endif + +void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, + nmv_context_counts *const nmv_counts) { + int i; +#if CONFIG_REF_MV + int nmv_ctx = 0; + for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { + nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx]; + nmv_context_counts *const counts = &nmv_counts[nmv_ctx]; +#if !CONFIG_EC_ADAPT + write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, + w); + + for (i = 0; i < 2; ++i) { + int j; + nmv_component *comp = &mvc->comps[i]; + nmv_component_counts *comp_counts = &counts->comps[i]; + + update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); + write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes, + MV_CLASSES, w); + write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0, + CLASS0_SIZE, w); + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); + } + + for (i = 0; i < 2; ++i) { + int j; + for (j = 0; j < CLASS0_SIZE; ++j) + write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j], + counts->comps[i].class0_fp[j], MV_FP_SIZE, w); + + write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, + MV_FP_SIZE, w); + } +#endif + + if (usehp) { + for (i = 0; i < 2; ++i) { + update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, + MV_UPDATE_PROB); + update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); + } + } + } +#else + nmv_context *const mvc = &cm->fc->nmvc; + nmv_context_counts *const counts = nmv_counts; + +#if !CONFIG_EC_ADAPT + write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w); + + for (i = 0; i < 2; ++i) { + int j; + nmv_component *comp = &mvc->comps[i]; + nmv_component_counts *comp_counts = &counts->comps[i]; + + update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); + write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes, + MV_CLASSES, w); + write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0, + CLASS0_SIZE, w); + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); + } + + for (i = 0; i < 2; ++i) { + int j; + for (j = 0; j < CLASS0_SIZE; ++j) { + write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j], + counts->comps[i].class0_fp[j], MV_FP_SIZE, w); + } + write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, + MV_FP_SIZE, w); + } +#endif // !CONFIG_EC_ADAPT + + if (usehp) { + for (i = 0; i < 2; ++i) { + update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, + MV_UPDATE_PROB); + update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); + } + } +#endif +} + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); +#else + av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); +#endif + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv.auto_mv_step_size) { + unsigned int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude); + } +} + +#if CONFIG_INTRABC +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + +#if CONFIG_EC_MULTISYMBOL + aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); +#else + av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); +#endif + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], 0); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], 0); +} +#endif // CONFIG_INTRABC + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, int usehp) { + av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); +} + +#if CONFIG_EXT_INTER +static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, + const int_mv mvs[2], +#if CONFIG_REF_MV + const int_mv pred_mvs[2], +#endif + nmv_context_counts *nmv_counts) { + int i; + PREDICTION_MODE mode = mbmi->mode; +#if !CONFIG_REF_MV + nmv_context_counts *counts = nmv_counts; +#endif + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { + const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; + const MV diff = { mvs[i].as_mv.row - ref->row, + mvs[i].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; + (void)pred_mvs; +#endif + av1_inc_mv(&diff, counts, 1); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv; + const MV diff = { mvs[1].as_mv.row - ref->row, + mvs[1].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; +#endif + av1_inc_mv(&diff, counts, 1); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + const MV diff = { mvs[0].as_mv.row - ref->row, + mvs[0].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; +#endif + av1_inc_mv(&diff, counts, 1); + } +} + +static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2], +#if CONFIG_REF_MV + const MB_MODE_INFO_EXT *mbmi_ext, +#endif + nmv_context_counts *nmv_counts) { + int i; + PREDICTION_MODE mode = mi->bmi[block].as_mode; +#if CONFIG_REF_MV + const MB_MODE_INFO *mbmi = &mi->mbmi; +#else + nmv_context_counts *counts = nmv_counts; +#endif + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) { + const MV *ref = &mi->bmi[block].ref_mv[i].as_mv; + const MV diff = { mvs[i].as_mv.row - ref->row, + mvs[i].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; +#endif + av1_inc_mv(&diff, counts, 1); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + const MV *ref = &mi->bmi[block].ref_mv[1].as_mv; + const MV diff = { mvs[1].as_mv.row - ref->row, + mvs[1].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; +#endif + av1_inc_mv(&diff, counts, 1); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + const MV *ref = &mi->bmi[block].ref_mv[0].as_mv; + const MV diff = { mvs[0].as_mv.row - ref->row, + mvs[0].as_mv.col - ref->col }; +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; +#endif + av1_inc_mv(&diff, counts, 1); + } +} +#else +static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, + const int_mv mvs[2], +#if CONFIG_REF_MV + const int_mv pred_mvs[2], +#endif + nmv_context_counts *nmv_counts) { + int i; +#if !CONFIG_REF_MV + nmv_context_counts *counts = nmv_counts; +#endif + + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { +#if CONFIG_REF_MV + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; + const MV *ref = &pred_mvs[i].as_mv; +#else + const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; +#endif + const MV diff = { mvs[i].as_mv.row - ref->row, + mvs[i].as_mv.col - ref->col }; + av1_inc_mv(&diff, counts, 1); + } +} +#endif // CONFIG_EXT_INTER + +void av1_update_mv_count(ThreadData *td) { + const MACROBLOCKD *xd = &td->mb.e_mbd; + const MODE_INFO *mi = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext; +#if CONFIG_CB4X4 + const int unify_bsize = 1; +#else + const int unify_bsize = 0; +#endif + + if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int i = idy * 2 + idx; + +#if CONFIG_EXT_INTER + if (have_newmv_in_inter_mode(mi->bmi[i].as_mode)) + inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, +#if CONFIG_REF_MV + mbmi_ext, td->counts->mv); +#else + &td->counts->mv); +#endif +#else + if (mi->bmi[i].as_mode == NEWMV) + inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, +#if CONFIG_REF_MV + mi->bmi[i].pred_mv, td->counts->mv); +#else + &td->counts->mv); +#endif +#endif // CONFIG_EXT_INTER + } + } + } else { +#if CONFIG_EXT_INTER + if (have_newmv_in_inter_mode(mbmi->mode)) +#else + if (mbmi->mode == NEWMV) +#endif // CONFIG_EXT_INTER + inc_mvs(mbmi, mbmi_ext, mbmi->mv, +#if CONFIG_REF_MV + mbmi->pred_mv, td->counts->mv); +#else + &td->counts->mv); +#endif + } +} diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h new file mode 100644 index 000000000..6d442147f --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_ENCODEMV_H_ +#define AV1_ENCODER_ENCODEMV_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_entropy_mv_init(void); + +void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, + nmv_context_counts *const counts); + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp); + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *mvctx, int usehp); + +void av1_update_mv_count(ThreadData *td); + +#if CONFIG_INTRABC +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx); +#endif // CONFIG_INTRABC + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_ENCODEMV_H_ diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c new file mode 100644 index 000000000..027109151 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.c @@ -0,0 +1,5980 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./aom_config.h" + +#include "av1/common/alloccommon.h" +#if CONFIG_CDEF +#include "av1/common/cdef.h" +#include "av1/common/clpf.h" +#endif // CONFIG_CDEF +#include "av1/common/filter.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/bitstream.h" +#if CONFIG_ANS +#include "aom_dsp/buf_ans.h" +#endif +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.h" +#endif +#include "av1/encoder/ethread.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mbgraph.h" +#include "av1/encoder/picklpf.h" +#if CONFIG_LOOP_RESTORATION +#include "av1/encoder/pickrst.h" +#endif // CONFIG_LOOP_RESTORATION +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/temporal_filter.h" + +#include "./av1_rtcd.h" +#include "./aom_dsp_rtcd.h" +#include "./aom_scale_rtcd.h" +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_ENTROPY_STATS +FRAME_COUNTS aggregate_fc; +#endif // CONFIG_ENTROPY_STATS + +#define AM_SEGMENT_ID_INACTIVE 7 +#define AM_SEGMENT_ID_ACTIVE 0 + +#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ + +#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv + // for altref computation. +#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision + // mv. Choose a very high value for + // now so that HIGH_PRECISION is always + // chosen. +// #define OUTPUT_YUV_REC +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file = NULL; +#endif +#ifdef OUTPUT_YUV_SKINMAP +FILE *yuv_skinmap_file = NULL; +#endif +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#define FILE_NAME_LEN 100 +#endif + +#if 0 +FILE *framepsnr; +FILE *kf_list; +FILE *keyfile; +#endif + +#if CONFIG_CFL +CFL_CTX NULL_CFL; +#endif + +#if CONFIG_INTERNAL_STATS +typedef enum { Y, U, V, ALL } STAT_TYPE; +#endif // CONFIG_INTERNAL_STATS + +static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +// Mark all inactive blocks as active. Other segmentation features may be set +// so memset cannot be used, instead only inactive blocks should be reset. +static void suppress_active_map(AV1_COMP *cpi) { + unsigned char *const seg_map = cpi->segmentation_map; + int i; + if (cpi->active_map.enabled || cpi->active_map.update) + for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) + seg_map[i] = AM_SEGMENT_ID_ACTIVE; +} + +static void apply_active_map(AV1_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->segmentation_map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + av1_enable_segmentation(seg); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); + // Setting the data to -MAX_LOOP_FILTER will result in the computed loop + // filter level being zero regardless of the value of seg->abs_delta. + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF, + -MAX_LOOP_FILTER); + } else { + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { + unsigned char *const active_map_8x8 = cpi->active_map.map; + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; + const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2; + const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2; + cpi->active_map.update = 1; + if (new_map_16x16) { + int r, c; + for (r = 0; r < mi_rows; ++r) { + for (c = 0; c < mi_cols; ++c) { + active_map_8x8[r * mi_cols + c] = + new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] + ? AM_SEGMENT_ID_ACTIVE + : AM_SEGMENT_ID_INACTIVE; + } + } + cpi->active_map.enabled = 1; + } else { + cpi->active_map.enabled = 0; + } + return 0; + } else { + return -1; + } +} + +int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols && + new_map_16x16) { + unsigned char *const seg_map_8x8 = cpi->segmentation_map; + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; + const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2; + const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2; + + memset(new_map_16x16, !cpi->active_map.enabled, rows * cols); + if (cpi->active_map.enabled) { + int r, c; + for (r = 0; r < mi_rows; ++r) { + for (c = 0; c < mi_cols; ++c) { + // Cyclic refresh segments are considered active despite not having + // AM_SEGMENT_ID_ACTIVE + new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |= + seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE; + } + } + } + return 0; + } else { + return -1; + } +} + +void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv) { + MACROBLOCK *const mb = &cpi->td.mb; + cpi->common.allow_high_precision_mv = allow_high_precision_mv; + +#if CONFIG_REF_MV + if (cpi->common.allow_high_precision_mv) { + int i; + for (i = 0; i < NMV_CONTEXTS; ++i) { + mb->mv_cost_stack[i] = mb->nmvcost_hp[i]; + mb->mvsadcost = mb->nmvsadcost_hp; + } + } else { + int i; + for (i = 0; i < NMV_CONTEXTS; ++i) { + mb->mv_cost_stack[i] = mb->nmvcost[i]; + mb->mvsadcost = mb->nmvsadcost; + } + } +#else + if (cpi->common.allow_high_precision_mv) { + mb->mvcost = mb->nmvcost_hp; + mb->mvsadcost = mb->nmvcost_hp; + } else { + mb->mvcost = mb->nmvcost; + mb->mvsadcost = mb->nmvcost; + } +#endif +} + +static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { +#if CONFIG_EXT_PARTITION + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) + return BLOCK_64X64; + + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) + return BLOCK_128X128; + + assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); + + assert(IMPLIES(cpi->common.tile_cols > 1, + cpi->common.tile_width % MAX_MIB_SIZE == 0)); + assert(IMPLIES(cpi->common.tile_rows > 1, + cpi->common.tile_height % MAX_MIB_SIZE == 0)); + + // TODO(any): Possibly could improve this with a heuristic. + return BLOCK_128X128; +#else + (void)cpi; + return BLOCK_64X64; +#endif // CONFIG_EXT_PARTITION +} + +static void setup_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + av1_setup_past_independence(cm); + } else { +#if CONFIG_EXT_REFS + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) + cm->frame_context_idx = EXT_ARF_FRAME; + else if (cpi->refresh_alt_ref_frame) + cm->frame_context_idx = ARF_FRAME; +#else + if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME; +#endif // CONFIG_EXT_REFS + else if (cpi->rc.is_src_frame_alt_ref) + cm->frame_context_idx = OVERLAY_FRAME; + else if (cpi->refresh_golden_frame) + cm->frame_context_idx = GLD_FRAME; +#if CONFIG_EXT_REFS + else if (cpi->refresh_bwd_ref_frame) + cm->frame_context_idx = BRF_FRAME; +#endif // CONFIG_EXT_REFS + else + cm->frame_context_idx = REGULAR_FRAME; + } + + if (cm->frame_type == KEY_FRAME) { + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + av1_zero(cpi->interp_filter_selected); + } else { + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + av1_zero(cpi->interp_filter_selected[0]); + } +#if CONFIG_EXT_REFS +#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame; + cpi->rc.is_bipred_frame = 1; + } +#endif +#endif + + cpi->vaq_refresh = 0; + + set_sb_size(cm, select_sb_size(cpi)); +} + +static void av1_enc_setup_mi(AV1_COMMON *cm) { + int i; + cm->mi = cm->mip + cm->mi_stride + 1; + memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + // Clear top border row + memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride); + // Clear left border column + for (i = 1; i < cm->mi_rows + 1; ++i) + memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip)); + + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; + + memset(cm->mi_grid_base, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); +} + +static int av1_enc_alloc_mi(AV1_COMMON *cm, int mi_size) { + cm->mip = aom_calloc(mi_size, sizeof(*cm->mip)); + if (!cm->mip) return 1; + cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip)); + if (!cm->prev_mip) return 1; + cm->mi_alloc_size = mi_size; + + cm->mi_grid_base = (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + if (!cm->mi_grid_base) return 1; + cm->prev_mi_grid_base = + (MODE_INFO **)aom_calloc(mi_size, sizeof(MODE_INFO *)); + if (!cm->prev_mi_grid_base) return 1; + + return 0; +} + +static void av1_enc_free_mi(AV1_COMMON *cm) { + aom_free(cm->mip); + cm->mip = NULL; + aom_free(cm->prev_mip); + cm->prev_mip = NULL; + aom_free(cm->mi_grid_base); + cm->mi_grid_base = NULL; + aom_free(cm->prev_mi_grid_base); + cm->prev_mi_grid_base = NULL; +} + +static void av1_swap_mi_and_prev_mi(AV1_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO **temp_base = cm->prev_mi_grid_base; + MODE_INFO *temp = cm->prev_mip; + cm->prev_mip = cm->mip; + cm->mip = temp; + + // Update the upper left visible macroblock ptrs. + cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp_base; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; +} + +void av1_initialize_enc(void) { + static volatile int init_done = 0; + + if (!init_done) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); +#if !CONFIG_XIPHRC + av1_rc_init_minq_luts(); +#endif + av1_entropy_mv_init(); + av1_encode_token_init(); +#if CONFIG_EXT_INTER + av1_init_wedge_masks(); +#endif + init_done = 1; + } +} + +static void dealloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i; + + aom_free(cpi->mbmi_ext_base); + cpi->mbmi_ext_base = NULL; + +#if CONFIG_PVQ + if (cpi->oxcf.pass != 1) { + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int tile_col, tile_row; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + aom_free(tile_data->pvq_q.buf); + } + } +#endif + aom_free(cpi->tile_data); + cpi->tile_data = NULL; + + // Delete sementation map + aom_free(cpi->segmentation_map); + cpi->segmentation_map = NULL; + + av1_cyclic_refresh_free(cpi->cyclic_refresh); + cpi->cyclic_refresh = NULL; + + aom_free(cpi->active_map.map); + cpi->active_map.map = NULL; + + // Free up-sampled reference buffers. + for (i = 0; i < (REF_FRAMES + 1); i++) + aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf); + + av1_free_ref_frame_buffers(cm->buffer_pool); +#if CONFIG_LV_MAP + av1_free_txb_buf(cpi); +#endif + av1_free_context_buffers(cm); + + aom_free_frame_buffer(&cpi->last_frame_uf); +#if CONFIG_LOOP_RESTORATION + av1_free_restoration_buffers(cm); + aom_free_frame_buffer(&cpi->last_frame_db); + aom_free_frame_buffer(&cpi->trial_frame_rst); + aom_free(cpi->extra_rstbuf); + for (i = 0; i < MAX_MB_PLANE; ++i) + av1_free_restoration_struct(&cpi->rst_search[i]); +#endif // CONFIG_LOOP_RESTORATION + aom_free_frame_buffer(&cpi->scaled_source); + aom_free_frame_buffer(&cpi->scaled_last_source); + aom_free_frame_buffer(&cpi->alt_ref_buffer); + av1_lookahead_destroy(cpi->lookahead); + + aom_free(cpi->tile_tok[0][0]); + cpi->tile_tok[0][0] = 0; + + av1_free_pc_tree(&cpi->td); + av1_free_var_tree(&cpi->td); + +#if CONFIG_PALETTE + if (cpi->common.allow_screen_content_tools) + aom_free(cpi->td.mb.palette_buffer); +#endif // CONFIG_PALETTE + + if (cpi->source_diff_var != NULL) { + aom_free(cpi->source_diff_var); + cpi->source_diff_var = NULL; + } +#if CONFIG_ANS + aom_buf_ans_free(&cpi->buf_ans); +#endif // CONFIG_ANS +} + +static void save_coding_context(AV1_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + AV1_COMMON *cm = &cpi->common; +#if CONFIG_REF_MV + int i; +#endif + +// Stores a snapshot of key state variables which can subsequently be +// restored with a call to av1_restore_coding_context. These functions are +// intended for use in a re-code loop in av1_compress_frame where the +// quantizer value is adjusted between loop iterations. +#if CONFIG_REF_MV + for (i = 0; i < NMV_CONTEXTS; ++i) { + av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]); + av1_copy(cc->nmv_costs, cpi->nmv_costs); + av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); + } +#else + av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost); +#endif + + av1_copy(cc->nmvcosts, cpi->nmvcosts); + av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp); + + av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); + av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); + + cc->fc = *cm->fc; +} + +static void restore_coding_context(AV1_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + AV1_COMMON *cm = &cpi->common; +#if CONFIG_REF_MV + int i; +#endif + +// Restore key state variables to the snapshot state stored in the +// previous call to av1_save_coding_context. +#if CONFIG_REF_MV + for (i = 0; i < NMV_CONTEXTS; ++i) { + av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]); + av1_copy(cpi->nmv_costs, cc->nmv_costs); + av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); + } +#else + av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost); +#endif + + av1_copy(cpi->nmvcosts, cc->nmvcosts); + av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp); + + av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); + av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); + + *cm->fc = cc->fc; +} + +static void configure_static_seg_features(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + int high_q = (int)(rc->avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation + av1_disable_segmentation(seg); + + // Clear down the segment features. + av1_clearall_segfeatures(seg); + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation and individual segment features by default + av1_disable_segmentation(seg); + av1_clearall_segfeatures(seg); + + // Scan frames from current to arf frame. + // This function re-enables segmentation if appropriate. + av1_update_mbgraph_stats(cpi); + + // If segmentation was enabled set those features needed for the + // arf itself. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = + av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Where relevant assume segment data is delta data + seg->abs_delta = SEGMENT_DELTADATA; + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + if (rc->source_alt_ref_active) { + seg->update_map = 0; + seg->update_data = 1; + seg->abs_delta = SEGMENT_DELTADATA; + + qi_delta = + av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Segment coding disabled for compred testing + if (high_q || (cpi->static_mb_pct == 100)) { + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + } else { + // Disable segmentation and clear down features if alt ref + // is not active for this group + + av1_disable_segmentation(seg); + + memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + av1_clearall_segfeatures(seg); + } + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. + seg->update_map = 0; + seg->update_data = 0; + } + } +} + +static void update_reference_segmentation_map(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->last_frame_seg_map; + int row, col; + + for (row = 0; row < cm->mi_rows; row++) { + MODE_INFO **mi_8x8 = mi_8x8_ptr; + uint8_t *cache = cache_ptr; + for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) + cache[0] = mi_8x8[0]->mbmi.segment_id; + mi_8x8_ptr += cm->mi_stride; + cache_ptr += cm->mi_cols; + } +} + +static void alloc_raw_frame_buffers(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + + if (!cpi->lookahead) + cpi->lookahead = av1_lookahead_init(oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + oxcf->lag_in_frames); + if (!cpi->lookahead) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); + + // TODO(agrange) Check if ARF is enabled and skip allocation if not. + if (aom_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); +} + +static void alloc_util_frame_buffers(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + +#if CONFIG_LOOP_RESTORATION + if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame deblocked buffer"); + if (aom_realloc_frame_buffer(&cpi->trial_frame_rst, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate trial restored frame buffer"); + int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE; + if (extra_rstbuf_sz > 0) { + aom_free(cpi->extra_rstbuf); + CHECK_MEM_ERROR(cm, cpi->extra_rstbuf, + (uint8_t *)aom_malloc(extra_rstbuf_sz)); + } else { + cpi->extra_rstbuf = NULL; + } +#endif // CONFIG_LOOP_RESTORATION + + if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled last source buffer"); +} + +static int alloc_context_buffers_ext(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int mi_size = cm->mi_cols * cm->mi_rows; + + cpi->mbmi_ext_base = aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)); + if (!cpi->mbmi_ext_base) return 1; + + return 0; +} + +void av1_alloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + av1_alloc_context_buffers(cm, cm->width, cm->height); + +#if CONFIG_LV_MAP + av1_alloc_txb_buf(cpi); +#endif + + alloc_context_buffers_ext(cpi); + + aom_free(cpi->tile_tok[0][0]); + + { + unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); +#if CONFIG_ANS && !ANS_MAX_SYMBOLS + aom_buf_ans_alloc(&cpi->buf_ans, &cm->error, (int)tokens); +#endif // CONFIG_ANS + } + + av1_setup_pc_tree(&cpi->common, &cpi->td); +} + +void av1_new_framerate(AV1_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; +#if CONFIG_XIPHRC + if (!cpi->od_rc.cur_frame) return; + cpi->od_rc.framerate = cpi->framerate; + od_enc_rc_resize(&cpi->od_rc); +#else + av1_rc_update_framerate(cpi); +#endif +} + +static void set_tile_info(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; +#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES + int tile_row, tile_col, num_tiles_in_tg; + int tg_row_start, tg_col_start; +#endif +#if CONFIG_EXT_TILE +#if CONFIG_EXT_PARTITION + if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { + cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32); + cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32); + cm->tile_width <<= MAX_MIB_SIZE_LOG2; + cm->tile_height <<= MAX_MIB_SIZE_LOG2; + } else { + cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); + cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); + cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; + cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1; + } +#else + cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); + cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); + cm->tile_width <<= MAX_MIB_SIZE_LOG2; + cm->tile_height <<= MAX_MIB_SIZE_LOG2; +#endif // CONFIG_EXT_PARTITION + + cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); + cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); + + assert(cm->tile_width >> MAX_MIB_SIZE <= 32); + assert(cm->tile_height >> MAX_MIB_SIZE <= 32); + + // Get the number of tiles + cm->tile_cols = 1; + while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; + + cm->tile_rows = 1; + while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; +#else + int min_log2_tile_cols, max_log2_tile_cols; + av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; + + cm->tile_cols = 1 << cm->log2_tile_cols; + cm->tile_rows = 1 << cm->log2_tile_rows; + + cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); + cm->tile_width >>= cm->log2_tile_cols; + cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); + cm->tile_height >>= cm->log2_tile_rows; + + // round to integer multiples of max superblock size + cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2); + cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2); +#endif // CONFIG_EXT_TILE + +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles; + if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; +#if CONFIG_TILE_GROUPS + if (cpi->oxcf.mtu == 0) { + cm->num_tg = cpi->oxcf.num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cm->num_tg = DEFAULT_MAX_NUM_TG; + } + num_tiles_in_tg = + (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; + tg_row_start = 0; + tg_col_start = 0; + for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { + if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { + tg_row_start = tile_row; + tg_col_start = tile_col; + } + cm->tile_group_start_row[tile_row][tile_col] = tg_row_start; + cm->tile_group_start_col[tile_row][tile_col] = tg_col_start; + } + } +#endif +#endif + +#if CONFIG_LOOPFILTERING_ACROSS_TILES + cm->loop_filter_across_tiles_enabled = + cpi->oxcf.loop_filter_across_tiles_enabled; +#endif // CONFIG_LOOPFILTERING_ACROSS_TILES +} + +static void update_frame_size(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + av1_set_mb_mi(cm, cm->width, cm->height); + av1_init_context_buffers(cm); + av1_init_macroblockd(cm, xd, +#if CONFIG_PVQ + NULL, +#endif +#if CONFIG_CFL + &NULL_CFL, +#endif + NULL); + memset(cpi->mbmi_ext_base, 0, + cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); + + set_tile_info(cpi); +} + +static void init_buffer_indices(AV1_COMP *cpi) { +#if CONFIG_EXT_REFS + int fb_idx; + for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx) + cpi->lst_fb_idxes[fb_idx] = fb_idx; + cpi->gld_fb_idx = LAST_REF_FRAMES; + cpi->bwd_fb_idx = LAST_REF_FRAMES + 1; + cpi->alt_fb_idx = LAST_REF_FRAMES + 2; + for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx) + cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx; +#else + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; +#endif // CONFIG_EXT_REFS +} + +static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; + + cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth = oxcf->use_highbitdepth; +#endif + cm->color_space = oxcf->color_space; + cm->color_range = oxcf->color_range; + + cm->width = oxcf->width; + cm->height = oxcf->height; + av1_alloc_compressor_data(cpi); + + // Single thread case: use counts in common. + cpi->td.counts = &cm->counts; + + // change includes all joint functionality + av1_change_config(cpi, oxcf); + + cpi->static_mb_pct = 0; + cpi->ref_frame_flags = 0; + + init_buffer_indices(cpi); +} + +static void set_rc_buffer_sizes(RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + const int64_t bandwidth = oxcf->target_bandwidth; + const int64_t starting = oxcf->starting_buffer_level_ms; + const int64_t optimal = oxcf->optimal_buffer_level_ms; + const int64_t maximum = oxcf->maximum_buffer_size_ms; + + rc->starting_buffer_level = starting * bandwidth / 1000; + rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; +} + +#if CONFIG_HIGHBITDEPTH +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx3f = SDX3F; \ + cpi->fn_ptr[BT].sdx8f = SDX8F; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD3_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ + } + +#define MAKE_BFP_SAD8_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ + } +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#if CONFIG_EXT_PARTITION +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) +#endif // CONFIG_EXT_PARTITION +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad32x32x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad32x32x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad64x64x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad64x64x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x16x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x16x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad16x8x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad16x8x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x16x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x16x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad8x8x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x8x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad8x4x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x8x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) +MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3) +MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) + +#if CONFIG_EXT_INTER +#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF) \ + cpi->fn_ptr[BT].msdf = MSDF; \ + cpi->fn_ptr[BT].mvf = MVF; \ + cpi->fn_ptr[BT].msvf = MSVF; + +#define MAKE_MBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *m, int m_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *m, int m_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *m, int m_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \ + 4; \ + } + +#if CONFIG_EXT_PARTITION +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128) +#endif // CONFIG_EXT_PARTITION +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4) +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR +#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ + cpi->fn_ptr[BT].osdf = OSDF; \ + cpi->fn_ptr[BT].ovf = OVF; \ + cpi->fn_ptr[BT].osvf = OSVF; + +#define MAKE_OBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 4; \ + } + +#if CONFIG_EXT_PARTITION +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) +#endif // CONFIG_EXT_PARTITION +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) +#endif // CONFIG_MOTION_VAR + +static void highbd_set_var_fns(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case AOM_BITS_8: + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, + aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, + aom_highbd_8_sub_pixel_variance32x16, + aom_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, + aom_highbd_sad32x16x4d_bits8) + + HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, + aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, + aom_highbd_8_sub_pixel_variance16x32, + aom_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, + aom_highbd_sad16x32x4d_bits8) + + HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, + aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, + aom_highbd_8_sub_pixel_variance64x32, + aom_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, + aom_highbd_sad64x32x4d_bits8) + + HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, + aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, + aom_highbd_8_sub_pixel_variance32x64, + aom_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, + aom_highbd_sad32x64x4d_bits8) + + HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, + aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, + aom_highbd_8_sub_pixel_variance32x32, + aom_highbd_8_sub_pixel_avg_variance32x32, + aom_highbd_sad32x32x3_bits8, aom_highbd_sad32x32x8_bits8, + aom_highbd_sad32x32x4d_bits8) + + HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, + aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, + aom_highbd_8_sub_pixel_variance64x64, + aom_highbd_8_sub_pixel_avg_variance64x64, + aom_highbd_sad64x64x3_bits8, aom_highbd_sad64x64x8_bits8, + aom_highbd_sad64x64x4d_bits8) + + HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, + aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, + aom_highbd_8_sub_pixel_variance16x16, + aom_highbd_8_sub_pixel_avg_variance16x16, + aom_highbd_sad16x16x3_bits8, aom_highbd_sad16x16x8_bits8, + aom_highbd_sad16x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, + aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, + aom_highbd_8_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x3_bits8, + aom_highbd_sad16x8x8_bits8, aom_highbd_sad16x8x4d_bits8) + + HIGHBD_BFP( + BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, + aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, + aom_highbd_8_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x3_bits8, + aom_highbd_sad8x16x8_bits8, aom_highbd_sad8x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, + aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits8, + aom_highbd_sad8x8x8_bits8, aom_highbd_sad8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8, + aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4, + aom_highbd_8_sub_pixel_variance8x4, + aom_highbd_8_sub_pixel_avg_variance8x4, NULL, + aom_highbd_sad8x4x8_bits8, aom_highbd_sad8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8, + aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8, + aom_highbd_8_sub_pixel_variance4x8, + aom_highbd_8_sub_pixel_avg_variance4x8, NULL, + aom_highbd_sad4x8x8_bits8, aom_highbd_sad4x8x4d_bits8) + + HIGHBD_BFP( + BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, + aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8, + aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8) + +#if CONFIG_CB4X4 + HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_8_variance2x4, NULL, NULL, + NULL, NULL, NULL) +#endif + +#if CONFIG_EXT_PARTITION + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, + aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8, + aom_highbd_sad128x128x4d_bits8) + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, + aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, + aom_highbd_8_sub_pixel_variance128x64, + aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL, + aom_highbd_sad128x64x4d_bits8) + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, + aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, + aom_highbd_8_sub_pixel_variance64x128, + aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL, + aom_highbd_sad64x128x4d_bits8) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_EXT_INTER +#if CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, + aom_highbd_masked_variance128x128, + aom_highbd_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, + aom_highbd_masked_variance128x64, + aom_highbd_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, + aom_highbd_masked_variance64x128, + aom_highbd_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, + aom_highbd_masked_variance64x64, + aom_highbd_masked_sub_pixel_variance64x64) + HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, + aom_highbd_masked_variance64x32, + aom_highbd_masked_sub_pixel_variance64x32) + HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8, + aom_highbd_masked_variance32x64, + aom_highbd_masked_sub_pixel_variance32x64) + HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8, + aom_highbd_masked_variance32x32, + aom_highbd_masked_sub_pixel_variance32x32) + HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8, + aom_highbd_masked_variance32x16, + aom_highbd_masked_sub_pixel_variance32x16) + HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8, + aom_highbd_masked_variance16x32, + aom_highbd_masked_sub_pixel_variance16x32) + HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8, + aom_highbd_masked_variance16x16, + aom_highbd_masked_sub_pixel_variance16x16) + HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8, + aom_highbd_masked_variance8x16, + aom_highbd_masked_sub_pixel_variance8x16) + HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8, + aom_highbd_masked_variance16x8, + aom_highbd_masked_sub_pixel_variance16x8) + HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8, + aom_highbd_masked_variance8x8, + aom_highbd_masked_sub_pixel_variance8x8) + HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8, + aom_highbd_masked_variance4x8, + aom_highbd_masked_sub_pixel_variance4x8) + HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8, + aom_highbd_masked_variance8x4, + aom_highbd_masked_sub_pixel_variance8x4) + HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, + aom_highbd_masked_variance4x4, + aom_highbd_masked_sub_pixel_variance4x4) +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR +#if CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8, + aom_highbd_obmc_variance128x128, + aom_highbd_obmc_sub_pixel_variance128x128) + HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8, + aom_highbd_obmc_variance128x64, + aom_highbd_obmc_sub_pixel_variance128x64) + HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8, + aom_highbd_obmc_variance64x128, + aom_highbd_obmc_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8, + aom_highbd_obmc_variance64x64, + aom_highbd_obmc_sub_pixel_variance64x64) + HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8, + aom_highbd_obmc_variance64x32, + aom_highbd_obmc_sub_pixel_variance64x32) + HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8, + aom_highbd_obmc_variance32x64, + aom_highbd_obmc_sub_pixel_variance32x64) + HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8, + aom_highbd_obmc_variance32x32, + aom_highbd_obmc_sub_pixel_variance32x32) + HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8, + aom_highbd_obmc_variance32x16, + aom_highbd_obmc_sub_pixel_variance32x16) + HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8, + aom_highbd_obmc_variance16x32, + aom_highbd_obmc_sub_pixel_variance16x32) + HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8, + aom_highbd_obmc_variance16x16, + aom_highbd_obmc_sub_pixel_variance16x16) + HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8, + aom_highbd_obmc_variance8x16, + aom_highbd_obmc_sub_pixel_variance8x16) + HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8, + aom_highbd_obmc_variance16x8, + aom_highbd_obmc_sub_pixel_variance16x8) + HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8, + aom_highbd_obmc_variance8x8, + aom_highbd_obmc_sub_pixel_variance8x8) + HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8, + aom_highbd_obmc_variance4x8, + aom_highbd_obmc_sub_pixel_variance4x8) + HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8, + aom_highbd_obmc_variance8x4, + aom_highbd_obmc_sub_pixel_variance8x4) + HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8, + aom_highbd_obmc_variance4x4, + aom_highbd_obmc_sub_pixel_variance4x4) +#endif // CONFIG_MOTION_VAR + break; + + case AOM_BITS_10: + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10, + aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16, + aom_highbd_10_sub_pixel_variance32x16, + aom_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, + aom_highbd_sad32x16x4d_bits10) + + HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10, + aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32, + aom_highbd_10_sub_pixel_variance16x32, + aom_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, + aom_highbd_sad16x32x4d_bits10) + + HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10, + aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32, + aom_highbd_10_sub_pixel_variance64x32, + aom_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, + aom_highbd_sad64x32x4d_bits10) + + HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10, + aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64, + aom_highbd_10_sub_pixel_variance32x64, + aom_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, + aom_highbd_sad32x64x4d_bits10) + + HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10, + aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32, + aom_highbd_10_sub_pixel_variance32x32, + aom_highbd_10_sub_pixel_avg_variance32x32, + aom_highbd_sad32x32x3_bits10, aom_highbd_sad32x32x8_bits10, + aom_highbd_sad32x32x4d_bits10) + + HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10, + aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64, + aom_highbd_10_sub_pixel_variance64x64, + aom_highbd_10_sub_pixel_avg_variance64x64, + aom_highbd_sad64x64x3_bits10, aom_highbd_sad64x64x8_bits10, + aom_highbd_sad64x64x4d_bits10) + + HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10, + aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16, + aom_highbd_10_sub_pixel_variance16x16, + aom_highbd_10_sub_pixel_avg_variance16x16, + aom_highbd_sad16x16x3_bits10, aom_highbd_sad16x16x8_bits10, + aom_highbd_sad16x16x4d_bits10) + + HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10, + aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8, + aom_highbd_10_sub_pixel_variance16x8, + aom_highbd_10_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x3_bits10, aom_highbd_sad16x8x8_bits10, + aom_highbd_sad16x8x4d_bits10) + + HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10, + aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16, + aom_highbd_10_sub_pixel_variance8x16, + aom_highbd_10_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x3_bits10, aom_highbd_sad8x16x8_bits10, + aom_highbd_sad8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10, + aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8, + aom_highbd_10_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits10, + aom_highbd_sad8x8x8_bits10, aom_highbd_sad8x8x4d_bits10) + + HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits10, + aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4, + aom_highbd_10_sub_pixel_variance8x4, + aom_highbd_10_sub_pixel_avg_variance8x4, NULL, + aom_highbd_sad8x4x8_bits10, aom_highbd_sad8x4x4d_bits10) + + HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits10, + aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8, + aom_highbd_10_sub_pixel_variance4x8, + aom_highbd_10_sub_pixel_avg_variance4x8, NULL, + aom_highbd_sad4x8x8_bits10, aom_highbd_sad4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10, + aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4, + aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10, + aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10) + +#if CONFIG_CB4X4 + HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_10_variance2x4, NULL, NULL, + NULL, NULL, NULL) +#endif + +#if CONFIG_EXT_PARTITION + HIGHBD_BFP( + BLOCK_128X128, aom_highbd_sad128x128_bits10, + aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128, + aom_highbd_10_sub_pixel_variance128x128, + aom_highbd_10_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10, + aom_highbd_sad128x128x4d_bits10) + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10, + aom_highbd_sad128x64_avg_bits10, + aom_highbd_10_variance128x64, + aom_highbd_10_sub_pixel_variance128x64, + aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL, + aom_highbd_sad128x64x4d_bits10) + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10, + aom_highbd_sad64x128_avg_bits10, + aom_highbd_10_variance64x128, + aom_highbd_10_sub_pixel_variance64x128, + aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL, + aom_highbd_sad64x128x4d_bits10) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_EXT_INTER +#if CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10, + aom_highbd_10_masked_variance128x128, + aom_highbd_10_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10, + aom_highbd_10_masked_variance128x64, + aom_highbd_10_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10, + aom_highbd_10_masked_variance64x128, + aom_highbd_10_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10, + aom_highbd_10_masked_variance64x64, + aom_highbd_10_masked_sub_pixel_variance64x64) + HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10, + aom_highbd_10_masked_variance64x32, + aom_highbd_10_masked_sub_pixel_variance64x32) + HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10, + aom_highbd_10_masked_variance32x64, + aom_highbd_10_masked_sub_pixel_variance32x64) + HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10, + aom_highbd_10_masked_variance32x32, + aom_highbd_10_masked_sub_pixel_variance32x32) + HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10, + aom_highbd_10_masked_variance32x16, + aom_highbd_10_masked_sub_pixel_variance32x16) + HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10, + aom_highbd_10_masked_variance16x32, + aom_highbd_10_masked_sub_pixel_variance16x32) + HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10, + aom_highbd_10_masked_variance16x16, + aom_highbd_10_masked_sub_pixel_variance16x16) + HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10, + aom_highbd_10_masked_variance8x16, + aom_highbd_10_masked_sub_pixel_variance8x16) + HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10, + aom_highbd_10_masked_variance16x8, + aom_highbd_10_masked_sub_pixel_variance16x8) + HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10, + aom_highbd_10_masked_variance8x8, + aom_highbd_10_masked_sub_pixel_variance8x8) + HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10, + aom_highbd_10_masked_variance4x8, + aom_highbd_10_masked_sub_pixel_variance4x8) + HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10, + aom_highbd_10_masked_variance8x4, + aom_highbd_10_masked_sub_pixel_variance8x4) + HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10, + aom_highbd_10_masked_variance4x4, + aom_highbd_10_masked_sub_pixel_variance4x4) +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR +#if CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10, + aom_highbd_10_obmc_variance128x128, + aom_highbd_10_obmc_sub_pixel_variance128x128) + HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10, + aom_highbd_10_obmc_variance128x64, + aom_highbd_10_obmc_sub_pixel_variance128x64) + HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10, + aom_highbd_10_obmc_variance64x128, + aom_highbd_10_obmc_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10, + aom_highbd_10_obmc_variance64x64, + aom_highbd_10_obmc_sub_pixel_variance64x64) + HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10, + aom_highbd_10_obmc_variance64x32, + aom_highbd_10_obmc_sub_pixel_variance64x32) + HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10, + aom_highbd_10_obmc_variance32x64, + aom_highbd_10_obmc_sub_pixel_variance32x64) + HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10, + aom_highbd_10_obmc_variance32x32, + aom_highbd_10_obmc_sub_pixel_variance32x32) + HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10, + aom_highbd_10_obmc_variance32x16, + aom_highbd_10_obmc_sub_pixel_variance32x16) + HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10, + aom_highbd_10_obmc_variance16x32, + aom_highbd_10_obmc_sub_pixel_variance16x32) + HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10, + aom_highbd_10_obmc_variance16x16, + aom_highbd_10_obmc_sub_pixel_variance16x16) + HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10, + aom_highbd_10_obmc_variance8x16, + aom_highbd_10_obmc_sub_pixel_variance8x16) + HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10, + aom_highbd_10_obmc_variance16x8, + aom_highbd_10_obmc_sub_pixel_variance16x8) + HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10, + aom_highbd_10_obmc_variance8x8, + aom_highbd_10_obmc_sub_pixel_variance8x8) + HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10, + aom_highbd_10_obmc_variance4x8, + aom_highbd_10_obmc_sub_pixel_variance4x8) + HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10, + aom_highbd_10_obmc_variance8x4, + aom_highbd_10_obmc_sub_pixel_variance8x4) + HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10, + aom_highbd_10_obmc_variance4x4, + aom_highbd_10_obmc_sub_pixel_variance4x4) +#endif // CONFIG_MOTION_VAR + break; + + case AOM_BITS_12: + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12, + aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16, + aom_highbd_12_sub_pixel_variance32x16, + aom_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, + aom_highbd_sad32x16x4d_bits12) + + HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12, + aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32, + aom_highbd_12_sub_pixel_variance16x32, + aom_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, + aom_highbd_sad16x32x4d_bits12) + + HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12, + aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32, + aom_highbd_12_sub_pixel_variance64x32, + aom_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, + aom_highbd_sad64x32x4d_bits12) + + HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12, + aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64, + aom_highbd_12_sub_pixel_variance32x64, + aom_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, + aom_highbd_sad32x64x4d_bits12) + + HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12, + aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32, + aom_highbd_12_sub_pixel_variance32x32, + aom_highbd_12_sub_pixel_avg_variance32x32, + aom_highbd_sad32x32x3_bits12, aom_highbd_sad32x32x8_bits12, + aom_highbd_sad32x32x4d_bits12) + + HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12, + aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64, + aom_highbd_12_sub_pixel_variance64x64, + aom_highbd_12_sub_pixel_avg_variance64x64, + aom_highbd_sad64x64x3_bits12, aom_highbd_sad64x64x8_bits12, + aom_highbd_sad64x64x4d_bits12) + + HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12, + aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16, + aom_highbd_12_sub_pixel_variance16x16, + aom_highbd_12_sub_pixel_avg_variance16x16, + aom_highbd_sad16x16x3_bits12, aom_highbd_sad16x16x8_bits12, + aom_highbd_sad16x16x4d_bits12) + + HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12, + aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8, + aom_highbd_12_sub_pixel_variance16x8, + aom_highbd_12_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x3_bits12, aom_highbd_sad16x8x8_bits12, + aom_highbd_sad16x8x4d_bits12) + + HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12, + aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16, + aom_highbd_12_sub_pixel_variance8x16, + aom_highbd_12_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x3_bits12, aom_highbd_sad8x16x8_bits12, + aom_highbd_sad8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12, + aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8, + aom_highbd_12_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x3_bits12, + aom_highbd_sad8x8x8_bits12, aom_highbd_sad8x8x4d_bits12) + + HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits12, + aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4, + aom_highbd_12_sub_pixel_variance8x4, + aom_highbd_12_sub_pixel_avg_variance8x4, NULL, + aom_highbd_sad8x4x8_bits12, aom_highbd_sad8x4x4d_bits12) + + HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits12, + aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8, + aom_highbd_12_sub_pixel_variance4x8, + aom_highbd_12_sub_pixel_avg_variance4x8, NULL, + aom_highbd_sad4x8x8_bits12, aom_highbd_sad4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12, + aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4, + aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12, + aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12) + +#if CONFIG_CB4X4 + HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL, + NULL, NULL, NULL) + HIGHBD_BFP(BLOCK_2X4, NULL, NULL, aom_highbd_12_variance2x4, NULL, NULL, + NULL, NULL, NULL) +#endif + +#if CONFIG_EXT_PARTITION + HIGHBD_BFP( + BLOCK_128X128, aom_highbd_sad128x128_bits12, + aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128, + aom_highbd_12_sub_pixel_variance128x128, + aom_highbd_12_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12, + aom_highbd_sad128x128x4d_bits12) + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12, + aom_highbd_sad128x64_avg_bits12, + aom_highbd_12_variance128x64, + aom_highbd_12_sub_pixel_variance128x64, + aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL, + aom_highbd_sad128x64x4d_bits12) + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12, + aom_highbd_sad64x128_avg_bits12, + aom_highbd_12_variance64x128, + aom_highbd_12_sub_pixel_variance64x128, + aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL, + aom_highbd_sad64x128x4d_bits12) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_EXT_INTER +#if CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12, + aom_highbd_12_masked_variance128x128, + aom_highbd_12_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12, + aom_highbd_12_masked_variance128x64, + aom_highbd_12_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12, + aom_highbd_12_masked_variance64x128, + aom_highbd_12_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12, + aom_highbd_12_masked_variance64x64, + aom_highbd_12_masked_sub_pixel_variance64x64) + HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12, + aom_highbd_12_masked_variance64x32, + aom_highbd_12_masked_sub_pixel_variance64x32) + HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12, + aom_highbd_12_masked_variance32x64, + aom_highbd_12_masked_sub_pixel_variance32x64) + HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12, + aom_highbd_12_masked_variance32x32, + aom_highbd_12_masked_sub_pixel_variance32x32) + HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12, + aom_highbd_12_masked_variance32x16, + aom_highbd_12_masked_sub_pixel_variance32x16) + HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12, + aom_highbd_12_masked_variance16x32, + aom_highbd_12_masked_sub_pixel_variance16x32) + HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12, + aom_highbd_12_masked_variance16x16, + aom_highbd_12_masked_sub_pixel_variance16x16) + HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12, + aom_highbd_12_masked_variance8x16, + aom_highbd_12_masked_sub_pixel_variance8x16) + HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12, + aom_highbd_12_masked_variance16x8, + aom_highbd_12_masked_sub_pixel_variance16x8) + HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12, + aom_highbd_12_masked_variance8x8, + aom_highbd_12_masked_sub_pixel_variance8x8) + HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12, + aom_highbd_12_masked_variance4x8, + aom_highbd_12_masked_sub_pixel_variance4x8) + HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12, + aom_highbd_12_masked_variance8x4, + aom_highbd_12_masked_sub_pixel_variance8x4) + HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12, + aom_highbd_12_masked_variance4x4, + aom_highbd_12_masked_sub_pixel_variance4x4) +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR +#if CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12, + aom_highbd_12_obmc_variance128x128, + aom_highbd_12_obmc_sub_pixel_variance128x128) + HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12, + aom_highbd_12_obmc_variance128x64, + aom_highbd_12_obmc_sub_pixel_variance128x64) + HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12, + aom_highbd_12_obmc_variance64x128, + aom_highbd_12_obmc_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12, + aom_highbd_12_obmc_variance64x64, + aom_highbd_12_obmc_sub_pixel_variance64x64) + HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12, + aom_highbd_12_obmc_variance64x32, + aom_highbd_12_obmc_sub_pixel_variance64x32) + HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12, + aom_highbd_12_obmc_variance32x64, + aom_highbd_12_obmc_sub_pixel_variance32x64) + HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12, + aom_highbd_12_obmc_variance32x32, + aom_highbd_12_obmc_sub_pixel_variance32x32) + HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12, + aom_highbd_12_obmc_variance32x16, + aom_highbd_12_obmc_sub_pixel_variance32x16) + HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12, + aom_highbd_12_obmc_variance16x32, + aom_highbd_12_obmc_sub_pixel_variance16x32) + HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12, + aom_highbd_12_obmc_variance16x16, + aom_highbd_12_obmc_sub_pixel_variance16x16) + HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12, + aom_highbd_12_obmc_variance8x16, + aom_highbd_12_obmc_sub_pixel_variance8x16) + HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12, + aom_highbd_12_obmc_variance16x8, + aom_highbd_12_obmc_sub_pixel_variance16x8) + HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12, + aom_highbd_12_obmc_variance8x8, + aom_highbd_12_obmc_sub_pixel_variance8x8) + HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12, + aom_highbd_12_obmc_variance4x8, + aom_highbd_12_obmc_sub_pixel_variance4x8) + HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12, + aom_highbd_12_obmc_variance8x4, + aom_highbd_12_obmc_sub_pixel_variance8x4) + HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12, + aom_highbd_12_obmc_variance4x4, + aom_highbd_12_obmc_sub_pixel_variance4x4) +#endif // CONFIG_MOTION_VAR + break; + + default: + assert(0 && + "cm->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + } + } +} +#endif // CONFIG_HIGHBITDEPTH + +static void realloc_segmentation_maps(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + + // Create the encoder segmentation map and set all entries to 0 + aom_free(cpi->segmentation_map); + CHECK_MEM_ERROR(cm, cpi->segmentation_map, + aom_calloc(cm->mi_rows * cm->mi_cols, 1)); + + // Create a map used for cyclic background refresh. + if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh); + CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + av1_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); + + // Create a map used to mark inactive areas. + aom_free(cpi->active_map.map); + CHECK_MEM_ERROR(cm, cpi->active_map.map, + aom_calloc(cm->mi_rows * cm->mi_cols, 1)); +} + +void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; + cm->color_space = oxcf->color_space; + cm->color_range = oxcf->color_range; + + if (cm->profile <= PROFILE_1) + assert(cm->bit_depth == AOM_BITS_8); + else + assert(cm->bit_depth > AOM_BITS_8); + + cpi->oxcf = *oxcf; +#if CONFIG_HIGHBITDEPTH + cpi->td.mb.e_mbd.bd = (int)cm->bit_depth; +#endif // CONFIG_HIGHBITDEPTH +#if CONFIG_GLOBAL_MOTION + cpi->td.mb.e_mbd.global_motion = cm->global_motion; +#endif // CONFIG_GLOBAL_MOTION + + if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { + rc->baseline_gf_interval = FIXED_GF_INTERVAL; + } else { + rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + } + + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 0; +#endif // CONFIG_EXT_REFS + + cm->refresh_frame_context = + (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) + ? REFRESH_FRAME_CONTEXT_FORWARD + : REFRESH_FRAME_CONTEXT_BACKWARD; + cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; + +#if CONFIG_PALETTE + cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN); + if (cm->allow_screen_content_tools) { + MACROBLOCK *x = &cpi->td.mb; + if (x->palette_buffer == 0) { + CHECK_MEM_ERROR(cm, x->palette_buffer, + aom_memalign(16, sizeof(*x->palette_buffer))); + } + // Reallocate the pc_tree, as it's contents depends on + // the state of cm->allow_screen_content_tools + av1_free_pc_tree(&cpi->td); + av1_setup_pc_tree(&cpi->common, &cpi->td); + } +#endif // CONFIG_PALETTE + + av1_reset_segment_features(cm); + av1_set_high_precision_mv(cpi, 0); + + set_rc_buffer_sizes(rc, &cpi->oxcf); + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size); + + // Set up frame rate and related parameters rate control values. + av1_new_framerate(cpi, cpi->framerate); + + // Set absolute upper and lower quality limits + rc->worst_quality = cpi->oxcf.worst_allowed_q; + rc->best_quality = cpi->oxcf.best_allowed_q; + + cm->interp_filter = cpi->sf.default_interp_filter; + + if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) { + cm->render_width = cpi->oxcf.render_width; + cm->render_height = cpi->oxcf.render_height; + } else { + cm->render_width = cpi->oxcf.width; + cm->render_height = cpi->oxcf.height; + } + cm->width = cpi->oxcf.width; + cm->height = cpi->oxcf.height; + + if (cpi->initial_width) { + if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) { + av1_free_context_buffers(cm); + av1_alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + } + } + update_frame_size(cpi); + + cpi->alt_ref_source = NULL; + rc->is_src_frame_alt_ref = 0; + +#if CONFIG_EXT_REFS + rc->is_bwd_ref_frame = 0; + rc->is_last_bipred_frame = 0; + rc->is_bipred_frame = 0; +#endif // CONFIG_EXT_REFS + +#if 0 + // Experimental RD Code + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif + + set_tile_info(cpi); + + cpi->ext_refresh_frame_flags_pending = 0; + cpi->ext_refresh_frame_context_pending = 0; + +#if CONFIG_HIGHBITDEPTH + highbd_set_var_fns(cpi); +#endif + +#if CONFIG_ANS && ANS_MAX_SYMBOLS + cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2; + if (cpi->buf_ans.size != (1 << cpi->common.ans_window_size_log2)) { + aom_buf_ans_free(&cpi->buf_ans); + aom_buf_ans_alloc(&cpi->buf_ans, &cpi->common.error, + 1 << cpi->common.ans_window_size_log2); + } +#endif // CONFIG_ANS && ANS_MAX_SYMBOLS +} + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +#if !CONFIG_REF_MV +static void cal_nmvjointsadcost(int *mvjointsadcost) { + mvjointsadcost[0] = 600; + mvjointsadcost[1] = 300; + mvjointsadcost[2] = 300; + mvjointsadcost[3] = 300; +} +#endif + +static void cal_nmvsadcosts(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + +static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + +static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) { + int i; + + for (i = 0; i < (REF_FRAMES + 1); ++i) { + cpi->upsampled_ref_bufs[i].ref_count = 0; + cpi->upsampled_ref_idx[i] = INVALID_IDX; + } +} + +AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, + BufferPool *const pool) { + unsigned int i; + AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); + AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; + + if (!cm) return NULL; + + av1_zero(*cpi); + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + av1_remove_compressor(cpi); + return 0; + } + + cm->error.setjmp = 1; + cm->alloc_mi = av1_enc_alloc_mi; + cm->free_mi = av1_enc_free_mi; + cm->setup_mi = av1_enc_setup_mi; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR(cm, cm->frame_contexts, + (FRAME_CONTEXT *)aom_memalign( + 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)); + + cpi->resize_state = 0; + cpi->resize_avg_qp = 0; + cpi->resize_buffer_underflow = 0; + cpi->common.buffer_pool = pool; + + init_config(cpi, oxcf); +#if CONFIG_XIPHRC + cpi->od_rc.framerate = cpi->framerate; + cpi->od_rc.frame_width = cm->render_width; + cpi->od_rc.frame_height = cm->render_height; + cpi->od_rc.keyframe_rate = oxcf->key_freq; + cpi->od_rc.goldenframe_rate = FIXED_GF_INTERVAL; + cpi->od_rc.altref_rate = 25; + cpi->od_rc.firstpass_quant = 1; + cpi->od_rc.bit_depth = cm->bit_depth; + cpi->od_rc.minq = oxcf->best_allowed_q; + cpi->od_rc.maxq = oxcf->worst_allowed_q; + if (cpi->oxcf.rc_mode == AOM_CQ) cpi->od_rc.minq = cpi->od_rc.quality; + cpi->od_rc.quality = cpi->oxcf.rc_mode == AOM_Q ? oxcf->cq_level : -1; + cpi->od_rc.periodic_boosts = oxcf->frame_periodic_boost; + od_enc_rc_init(&cpi->od_rc, + cpi->oxcf.rc_mode == AOM_Q ? -1 : oxcf->target_bandwidth, + oxcf->maximum_buffer_size_ms); +#else + av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); +#endif + + cm->current_video_frame = 0; + cpi->partition_search_skippable_frame = 0; + cpi->tile_data = NULL; + cpi->last_show_frame_buf_idx = INVALID_IDX; + + realloc_segmentation_maps(cpi); + +#if CONFIG_REF_MV + for (i = 0; i < NMV_CONTEXTS; ++i) { + memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); + memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); + } +#endif + + memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts)); + memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp)); + memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts)); + memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp)); + + for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); + i++) { + CHECK_MEM_ERROR( + cm, cpi->mbgraph_stats[i].mb_stats, + aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); + } + +#if CONFIG_FP_MB_STATS + cpi->use_fp_mb_stats = 0; + if (cpi->use_fp_mb_stats) { + // a place holder used to store the first pass mb stats in the first pass + CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, + aom_calloc(cm->MBs * sizeof(uint8_t), 1)); + } else { + cpi->twopass.frame_mb_stats_buf = NULL; + } +#endif + + cpi->refresh_alt_ref_frame = 0; + cpi->multi_arf_last_grp_enabled = 0; + + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; +#if CONFIG_INTERNAL_STATS + cpi->b_calculate_blockiness = 1; + cpi->b_calculate_consistency = 1; + cpi->total_inconsistency = 0; + cpi->psnr.worst = 100.0; + cpi->worst_ssim = 100.0; + + cpi->count = 0; + cpi->bytes = 0; + + if (cpi->b_calculate_psnr) { + cpi->total_sq_error = 0; + cpi->total_samples = 0; + cpi->tot_recode_hits = 0; + cpi->summed_quality = 0; + cpi->summed_weights = 0; + } + + cpi->fastssim.worst = 100.0; + cpi->psnrhvs.worst = 100.0; + + if (cpi->b_calculate_blockiness) { + cpi->total_blockiness = 0; + cpi->worst_blockiness = 0.0; + } + + if (cpi->b_calculate_consistency) { + CHECK_MEM_ERROR(cm, cpi->ssim_vars, + aom_malloc(sizeof(*cpi->ssim_vars) * 4 * + cpi->common.mi_rows * cpi->common.mi_cols)); + cpi->worst_consistency = 100.0; + } +#endif +#if CONFIG_ENTROPY_STATS + av1_zero(aggregate_fc); +#endif // CONFIG_ENTROPY_STATS + + cpi->first_time_stamp_ever = INT64_MAX; + +#if CONFIG_REF_MV + for (i = 0; i < NMV_CONTEXTS; ++i) { + cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX]; + cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX]; + cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX]; + cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX]; + } +#else + cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost); + cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; + cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; + cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; + cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX]; +#endif + cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; + cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; + cal_nmvsadcosts(cpi->td.mb.nmvsadcost); + + cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; + cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; + cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp); + +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "ab"); +#endif +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif + +#if 0 + framepsnr = fopen("framepsnr.stt", "a"); + kf_list = fopen("kf_list.stt", "w"); +#endif + +#if CONFIG_XIPHRC + if (oxcf->pass == 2) { + cpi->od_rc.twopass_allframes_buf = oxcf->two_pass_stats_in.buf; + cpi->od_rc.twopass_allframes_buf_size = oxcf->two_pass_stats_in.sz; + } +#else + if (oxcf->pass == 1) { + av1_init_first_pass(cpi); + } else if (oxcf->pass == 2) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + const size_t psz = cpi->common.MBs * sizeof(uint8_t); + const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); + + cpi->twopass.firstpass_mb_stats.mb_stats_start = + oxcf->firstpass_mb_stats_in.buf; + cpi->twopass.firstpass_mb_stats.mb_stats_end = + cpi->twopass.firstpass_mb_stats.mb_stats_start + + (ps - 1) * cpi->common.MBs * sizeof(uint8_t); + } +#endif + + cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; + cpi->twopass.stats_in = cpi->twopass.stats_in_start; + cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; + + av1_init_second_pass(cpi); + } +#endif + + init_upsampled_ref_frame_bufs(cpi); + + av1_set_speed_features_framesize_independent(cpi); + av1_set_speed_features_framesize_dependent(cpi); + + // Allocate memory to store variances for a frame. + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + aom_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); + cpi->source_var_thresh = 0; + cpi->frames_till_next_var_check = 0; + +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx3f = SDX3F; \ + cpi->fn_ptr[BT].sdx8f = SDX8F; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; + +#if CONFIG_EXT_PARTITION + BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, + aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, + aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d) + + BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, + aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL, + NULL, aom_sad128x64x4d) + + BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, + aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL, + NULL, aom_sad64x128x4d) +#endif // CONFIG_EXT_PARTITION + + BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, + aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL, + aom_sad32x16x4d) + + BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, + aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, NULL, NULL, + aom_sad16x32x4d) + + BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, + aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, NULL, NULL, + aom_sad64x32x4d) + + BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, + aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, NULL, NULL, + aom_sad32x64x4d) + + BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, + aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, + aom_sad32x32x3, aom_sad32x32x8, aom_sad32x32x4d) + + BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, + aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, + aom_sad64x64x3, aom_sad64x64x8, aom_sad64x64x4d) + + BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, + aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, + aom_sad16x16x3, aom_sad16x16x8, aom_sad16x16x4d) + + BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, + aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x3, + aom_sad16x8x8, aom_sad16x8x4d) + + BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, + aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x3, + aom_sad8x16x8, aom_sad8x16x4d) + + BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, + aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x3, + aom_sad8x8x8, aom_sad8x8x4d) + + BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, + aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, NULL, + aom_sad8x4x8, aom_sad8x4x4d) + + BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, + aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, NULL, + aom_sad4x8x8, aom_sad4x8x4d) + + BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, + aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3, + aom_sad4x4x8, aom_sad4x4x4d) + +#if CONFIG_CB4X4 + BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL) + BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL) + BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL) +#endif + +#if CONFIG_MOTION_VAR +#define OBFP(BT, OSDF, OVF, OSVF) \ + cpi->fn_ptr[BT].osdf = OSDF; \ + cpi->fn_ptr[BT].ovf = OVF; \ + cpi->fn_ptr[BT].osvf = OSVF; + +#if CONFIG_EXT_PARTITION + OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, + aom_obmc_sub_pixel_variance128x128) + OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, + aom_obmc_sub_pixel_variance128x64) + OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, + aom_obmc_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, + aom_obmc_sub_pixel_variance64x64) + OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, + aom_obmc_sub_pixel_variance64x32) + OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64, + aom_obmc_sub_pixel_variance32x64) + OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32, + aom_obmc_sub_pixel_variance32x32) + OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16, + aom_obmc_sub_pixel_variance32x16) + OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32, + aom_obmc_sub_pixel_variance16x32) + OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16, + aom_obmc_sub_pixel_variance16x16) + OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8, + aom_obmc_sub_pixel_variance16x8) + OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16, + aom_obmc_sub_pixel_variance8x16) + OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8, + aom_obmc_sub_pixel_variance8x8) + OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8, + aom_obmc_sub_pixel_variance4x8) + OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4, + aom_obmc_sub_pixel_variance8x4) + OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, + aom_obmc_sub_pixel_variance4x4) +#endif // CONFIG_MOTION_VAR + +#if CONFIG_EXT_INTER +#define MBFP(BT, MSDF, MVF, MSVF) \ + cpi->fn_ptr[BT].msdf = MSDF; \ + cpi->fn_ptr[BT].mvf = MVF; \ + cpi->fn_ptr[BT].msvf = MSVF; + +#if CONFIG_EXT_PARTITION + MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128, + aom_masked_sub_pixel_variance128x128) + MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64, + aom_masked_sub_pixel_variance128x64) + MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128, + aom_masked_sub_pixel_variance64x128) +#endif // CONFIG_EXT_PARTITION + MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64, + aom_masked_sub_pixel_variance64x64) + MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32, + aom_masked_sub_pixel_variance64x32) + MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64, + aom_masked_sub_pixel_variance32x64) + MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32, + aom_masked_sub_pixel_variance32x32) + MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16, + aom_masked_sub_pixel_variance32x16) + MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32, + aom_masked_sub_pixel_variance16x32) + MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16, + aom_masked_sub_pixel_variance16x16) + MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8, + aom_masked_sub_pixel_variance16x8) + MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16, + aom_masked_sub_pixel_variance8x16) + MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8, + aom_masked_sub_pixel_variance8x8) + MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8, + aom_masked_sub_pixel_variance4x8) + MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4, + aom_masked_sub_pixel_variance8x4) + MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4, + aom_masked_sub_pixel_variance4x4) +#endif // CONFIG_EXT_INTER + +#if CONFIG_HIGHBITDEPTH + highbd_set_var_fns(cpi); +#endif + + /* av1_init_quantizer() is first called here. Add check in + * av1_frame_init_quantizer() so that av1_init_quantizer is only + * called later when needed. This will avoid unnecessary calls of + * av1_init_quantizer() for every frame. + */ + av1_init_quantizer(cpi); +#if CONFIG_AOM_QM + aom_qm_init(cm); +#endif + + av1_loop_filter_init(cm); +#if CONFIG_LOOP_RESTORATION + av1_loop_restoration_precal(); +#endif // CONFIG_LOOP_RESTORATION + + cm->error.setjmp = 0; + + return cpi; +} + +#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) + +#define SNPRINT2(H, T, V) \ + snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) + +void av1_remove_compressor(AV1_COMP *cpi) { + AV1_COMMON *cm; + unsigned int i; + int t; + + if (!cpi) return; + + cm = &cpi->common; + if (cm->current_video_frame > 0) { +#if CONFIG_ENTROPY_STATS + if (cpi->oxcf.pass != 1) { + fprintf(stderr, "Writing counts.stt\n"); + FILE *f = fopen("counts.stt", "wb"); + fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); + fclose(f); + } +#endif // CONFIG_ENTROPY_STATS +#if CONFIG_INTERNAL_STATS + aom_clear_system_state(); + + if (cpi->oxcf.pass != 1) { + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / + 10000000.000; + double total_encode_time = + (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; + const double dr = + (double)cpi->bytes * (double)8 / (double)1000 / time_encoded; + const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1); + const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (cpi->b_calculate_psnr) { + const double total_psnr = aom_sse_to_psnr( + (double)cpi->total_samples, peak, (double)cpi->total_sq_error); + const double total_ssim = + 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f", + dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, + cpi->psnr.stat[ALL] / cpi->count, total_psnr, total_ssim, + total_ssim, cpi->fastssim.stat[ALL] / cpi->count, + cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + + if (cpi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count); + SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness); + } + + if (cpi->b_calculate_consistency) { + double consistency = + aom_sse_to_psnr((double)cpi->total_samples, peak, + (double)cpi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", cpi->worst_consistency); + } + fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings); + fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time, + rate_err, fabs(rate_err)); + } + + fclose(f); + } + +#endif + +#if 0 + { + printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); + printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, + cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, + cpi->time_compress_data / 1000, + (cpi->time_receive_data + cpi->time_compress_data) / 1000); + } +#endif + } + + for (t = 0; t < cpi->num_workers; ++t) { + AVxWorker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; + + // Deallocate allocated threads. + aom_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { +#if CONFIG_PALETTE + if (cpi->common.allow_screen_content_tools) + aom_free(thread_data->td->mb.palette_buffer); +#endif // CONFIG_PALETTE + aom_free(thread_data->td->counts); + av1_free_pc_tree(thread_data->td); + av1_free_var_tree(thread_data->td); + aom_free(thread_data->td); + } + } + aom_free(cpi->tile_thr_data); + aom_free(cpi->workers); + + if (cpi->num_workers > 1) av1_loop_filter_dealloc(&cpi->lf_row_sync); + + dealloc_compressor_data(cpi); + + for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); + ++i) { + aom_free(cpi->mbgraph_stats[i].mb_stats); + } + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + aom_free(cpi->twopass.frame_mb_stats_buf); + cpi->twopass.frame_mb_stats_buf = NULL; + } +#endif +#if CONFIG_INTERNAL_STATS + aom_free(cpi->ssim_vars); + cpi->ssim_vars = NULL; +#endif // CONFIG_INTERNAL_STATS + + av1_remove_common(cm); + av1_free_ref_frame_buffers(cm->buffer_pool); + aom_free(cpi); + +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif +#ifdef OUTPUT_YUV_REC + fclose(yuv_rec_file); +#endif + +#if 0 + + if (keyfile) + fclose(keyfile); + + if (framepsnr) + fclose(framepsnr); + + if (kf_list) + fclose(kf_list); + +#endif +} + +static void generate_psnr_packet(AV1_COMP *cpi) { + struct aom_codec_cx_pkt pkt; + int i; + PSNR_STATS psnr; +#if CONFIG_HIGHBITDEPTH + aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr, + cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); +#else + aom_calc_psnr(cpi->source, cpi->common.frame_to_show, &psnr); +#endif + + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples[i] = psnr.samples[i]; + pkt.data.psnr.sse[i] = psnr.sse[i]; + pkt.data.psnr.psnr[i] = psnr.psnr[i]; + } + pkt.kind = AOM_CODEC_PSNR_PKT; + aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt); +} + +int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) { + if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; + + cpi->ref_frame_flags = ref_frame_flags; + return 0; +} + +void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) { + cpi->ext_refresh_golden_frame = (ref_frame_flags & AOM_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & AOM_ALT_FLAG) != 0; + cpi->ext_refresh_last_frame = (ref_frame_flags & AOM_LAST_FLAG) != 0; + cpi->ext_refresh_frame_flags_pending = 1; +} + +static YV12_BUFFER_CONFIG *get_av1_ref_frame_buffer( + AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag) { + MV_REFERENCE_FRAME ref_frame = NONE_FRAME; + if (ref_frame_flag == AOM_LAST_FLAG) ref_frame = LAST_FRAME; +#if CONFIG_EXT_REFS + else if (ref_frame_flag == AOM_LAST2_FLAG) + ref_frame = LAST2_FRAME; + else if (ref_frame_flag == AOM_LAST3_FLAG) + ref_frame = LAST3_FRAME; +#endif // CONFIG_EXT_REFS + else if (ref_frame_flag == AOM_GOLD_FLAG) + ref_frame = GOLDEN_FRAME; +#if CONFIG_EXT_REFS + else if (ref_frame_flag == AOM_BWD_FLAG) + ref_frame = BWDREF_FRAME; +#endif // CONFIG_EXT_REFS + else if (ref_frame_flag == AOM_ALT_FLAG) + ref_frame = ALTREF_FRAME; + + return ref_frame == NONE_FRAME ? NULL : get_ref_frame_buffer(cpi, ref_frame); +} + +int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + aom_yv12_copy_frame(cfg, sd); + return 0; + } else { + return -1; + } +} + +int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + aom_yv12_copy_frame(sd, cfg); + return 0; + } else { + return -1; + } +} + +int av1_update_entropy(AV1_COMP *cpi, int update) { + cpi->ext_refresh_frame_context = update; + cpi->ext_refresh_frame_context_pending = 1; + return 0; +} + +#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) +// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it +// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do +// not denoise the UV channels at this time. If ever we implement UV channel +// denoising we will have to modify this. +void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { + uint8_t *src = s->y_buffer; + int h = s->y_height; + + do { + fwrite(src, s->y_width, 1, f); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, f); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, f); + src += s->uv_stride; + } while (--h); +} +#endif + +#if CONFIG_EXT_REFS && !CONFIG_XIPHRC +static void check_show_existing_frame(AV1_COMP *cpi) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + AV1_COMMON *const cm = &cpi->common; + const FRAME_UPDATE_TYPE next_frame_update_type = + gf_group->update_type[gf_group->index]; + const int which_arf = gf_group->arf_update_idx[gf_group->index]; + + if (cm->show_existing_frame == 1) { + cm->show_existing_frame = 0; + } else if (cpi->rc.is_last_bipred_frame) { + // NOTE(zoeliu): If the current frame is a last bi-predictive frame, it is + // needed next to show the BWDREF_FRAME, which is pointed by + // the last_fb_idxes[0] after reference frame buffer update + cpi->rc.is_last_bipred_frame = 0; + cm->show_existing_frame = 1; + cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0]; + } else if (cpi->is_arf_filter_off[which_arf] && + (next_frame_update_type == OVERLAY_UPDATE || + next_frame_update_type == INTNL_OVERLAY_UPDATE)) { + // Other parameters related to OVERLAY_UPDATE will be taken care of + // in av1_rc_get_second_pass_params(cpi) + cm->show_existing_frame = 1; + cpi->rc.is_src_frame_alt_ref = 1; + cpi->existing_fb_idx_to_show = cpi->alt_fb_idx; + cpi->is_arf_filter_off[which_arf] = 0; + } + cpi->rc.is_src_frame_ext_arf = 0; +} +#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC + +#ifdef OUTPUT_YUV_REC +void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { + uint8_t *src = s->y_buffer; + int h = cm->height; + +#if CONFIG_HIGHBITDEPTH + if (s->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); + + do { + fwrite(src16, s->y_width, 2, yuv_rec_file); + src16 += s->y_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->u_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->v_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); + return; + } +#endif // CONFIG_HIGHBITDEPTH + + do { + fwrite(src, s->y_width, 1, yuv_rec_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); +} +#endif // OUTPUT_YUV_REC + +#if CONFIG_HIGHBITDEPTH +static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int bd) { +#else +static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { +#endif // CONFIG_HIGHBITDEPTH + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t + int i; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + const int src_widths[3] = { src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = { src->y_crop_height, src->uv_crop_height, + src->uv_crop_height }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i], + src_strides[i], dsts[i], dst_heights[i], + dst_widths[i], dst_strides[i], bd); + } else { + av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); + } +#else + av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); +#endif // CONFIG_HIGHBITDEPTH + } + aom_extend_frame_borders(dst); +} + +#if CONFIG_HIGHBITDEPTH +static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int planes, + int bd) { +#else +static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int planes) { +#endif // CONFIG_HIGHBITDEPTH + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpFilterParams interp_filter_params = + av1_get_interp_filter_params(EIGHTTAP_REGULAR); + const int16_t *kernel = interp_filter_params.filter_ptr; + const int taps = interp_filter_params.taps; + int x, y, i; + + assert(planes <= 3); + for (y = 0; y < dst_h; y += 16) { + for (x = 0; x < dst_w; x += 16) { + for (i = 0; i < planes; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + +#if CONFIG_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, + &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, + 16 / factor, 16 / factor, bd); + } else { + aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, + &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, + &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, + 16 / factor, 16 / factor); + } +#else + aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, + &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, + &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, + 16 / factor, 16 / factor); +#endif // CONFIG_HIGHBITDEPTH + } + } + } + + if (planes == 1) + aom_extend_frame_borders_y(dst); + else + aom_extend_frame_borders(dst); +} + +static int scale_down(AV1_COMP *cpi, int q) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int scale = 0; + assert(frame_is_kf_gf_arf(cpi)); + + if (rc->frame_size_selector == UNSCALED && + q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) { + const int max_size_thresh = + (int)(rate_thresh_mult[SCALE_STEP1] * + AOMMAX(rc->this_frame_target, rc->avg_frame_bandwidth)); + scale = rc->projected_frame_size > max_size_thresh ? 1 : 0; + } + return scale; +} + +#if CONFIG_GLOBAL_MOTION +#define GM_RECODE_LOOP_NUM4X4_FACTOR 192 +static int recode_loop_test_global_motion(AV1_COMP *cpi) { + int i; + int recode = 0; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + AV1_COMMON *const cm = &cpi->common; + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + if (cm->global_motion[i].wmtype != IDENTITY && + rdc->global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR < + cpi->gmparams_cost[i]) { + set_default_warp_params(&cm->global_motion[i]); + cpi->gmparams_cost[i] = 0; +#if CONFIG_REF_MV + recode = 1; +#else + recode |= (rdc->global_motion_used[i] > 0); +#endif + } + } + return recode; +} +#endif // CONFIG_GLOBAL_MOTION + +// Function to test for conditions that indicate we should loop +// back and recode a frame. +static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, + int maxq, int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop == ALLOW_RECODE) || + (frame_is_kfgfarf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) { + if (frame_is_kfgfarf && (oxcf->resize_mode == RESIZE_DYNAMIC) && + scale_down(cpi, q)) { + // Code this group at a lower resolution. + cpi->resize_pending = 1; + return 1; + } + + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_mode == AOM_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) { + int i; + + for (i = 0; i < (REF_FRAMES + 1); i++) { + if (!ubufs[i].ref_count) { + return i; + } + } + return INVALID_IDX; +} + +// Up-sample 1 reference frame. +static INLINE int upsample_ref_frame(AV1_COMP *cpi, + const YV12_BUFFER_CONFIG *const ref) { + AV1_COMMON *const cm = &cpi->common; + EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs; + int new_uidx = get_free_upsampled_ref_buf(ubufs); + + if (new_uidx == INVALID_IDX) { + return INVALID_IDX; + } else { + YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf; + + // Can allocate buffer for Y plane only. + if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6)) + if (aom_realloc_frame_buffer(upsampled_ref, (cm->width << 3), + (cm->height << 3), cm->subsampling_x, + cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + (AOM_BORDER_IN_PIXELS << 3), + cm->byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate up-sampled frame buffer"); + +// Currently, only Y plane is up-sampled, U, V are not used. +#if CONFIG_HIGHBITDEPTH + scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth); +#else + scale_and_extend_frame(ref, upsampled_ref, 1); +#endif + return new_uidx; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; + } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_video_frame, ref_frame); + dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +#if CONFIG_EXT_REFS +// This function is used to shift the virtual indices of last reference frames +// as follows: +// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME +// when the LAST_FRAME is updated. +static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { + int ref_frame; + for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) { + cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1]; + + // [0] is allocated to the current coded frame. The statistics for the + // reference frames start at [LAST_FRAME], i.e. [1]. + if (!cpi->rc.is_src_frame_alt_ref) { + memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME], + cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME], + sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME])); + } + } +} +#endif // CONFIG_EXT_REFS + +void av1_update_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + const int use_upsampled_ref = cpi->sf.use_upsampled_references; + int new_uidx = 0; + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; + + if (use_upsampled_ref) { +#if CONFIG_EXT_REFS + if (cm->show_existing_frame) { + new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show]; + // TODO(zoeliu): Once following is confirmed, remove it. + assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0); + } else { +#endif // CONFIG_EXT_REFS + // Up-sample the current encoded frame. + RefCntBuffer *bufs = pool->frame_bufs; + const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf; + + new_uidx = upsample_ref_frame(cpi, ref); +#if CONFIG_EXT_REFS + assert(new_uidx != INVALID_IDX); + } +#endif // CONFIG_EXT_REFS + } + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->new_fb_idx); +#if CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], + cm->new_fb_idx); +#endif // CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + cm->new_fb_idx); + + if (use_upsampled_ref) { + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); +#if CONFIG_EXT_REFS + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx); +#endif // CONFIG_EXT_REFS + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); + } + } else if (av1_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. + int tmp; + + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], + cm->new_fb_idx); + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); + + tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + +#if CONFIG_EXT_REFS + // We need to modify the mapping accordingly + cpi->arf_map[0] = cpi->alt_fb_idx; +#endif +// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to +// cpi->interp_filter_selected[GOLDEN_FRAME]? +#if CONFIG_EXT_REFS + } else if (cpi->rc.is_last_bipred_frame) { + // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME + // by updating the virtual indices. Note that the frame BWDREF_FRAME points + // to now should be retired, and it should not be used before refreshed. + int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + + shift_last_ref_frames(cpi); + cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx; + cpi->bwd_fb_idx = tmp; + + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[BWDREF_FRAME], + sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); + } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) { + // Deal with the special case for showing existing internal ALTREF_FRAME + // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME + // by updating the virtual indices. + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int which_arf = gf_group->arf_ref_idx[gf_group->index]; + int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + + shift_last_ref_frames(cpi); + cpi->lst_fb_idxes[0] = cpi->alt_fb_idx; + cpi->alt_fb_idx = tmp; + + // We need to modify the mapping accordingly + cpi->arf_map[which_arf] = cpi->alt_fb_idx; + + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[ALTREF_FRAME + which_arf], + sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf])); +#endif // CONFIG_EXT_REFS + } else { /* For non key/golden frames */ + if (cpi->refresh_alt_ref_frame) { + int arf_idx = cpi->alt_fb_idx; + int which_arf = 0; +#if CONFIG_EXT_REFS + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + which_arf = gf_group->arf_update_idx[gf_group->index]; + arf_idx = cpi->arf_map[which_arf]; + } +#else + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; + } +#endif // CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx], + new_uidx); + + memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } + + if (cpi->refresh_golden_frame) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->new_fb_idx); + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); + +#if !CONFIG_EXT_REFS + if (!cpi->rc.is_src_frame_alt_ref) +#endif // !CONFIG_EXT_REFS + memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } + +#if CONFIG_EXT_REFS + if (cpi->refresh_bwd_ref_frame) { + if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) { + // We have swapped the virtual indices to allow bwd_ref_frame to use + // ALT0 as reference frame. We need to swap them back. + // NOTE: The ALT_REFs' are indexed reversely, and ALT0 refers to the + // farthest ALT_REF from the first frame in the gf group. + int tmp = cpi->arf_map[0]; + cpi->arf_map[0] = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->bwd_fb_idx; + cpi->bwd_fb_idx = tmp; + } + + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], + cm->new_fb_idx); + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx); + + memcpy(cpi->interp_filter_selected[BWDREF_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } +#endif // CONFIG_EXT_REFS + } + + if (cpi->refresh_last_frame) { +#if CONFIG_EXT_REFS + // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame + // reference to the reference frame buffer virtual index; and then (2) from + // the virtual index to the reference frame buffer physical index: + // + // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME + // | | | + // v v v + // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx + // | | | + // v v v + // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[] + // + // When refresh_last_frame is set, it is intended to retire LAST3_FRAME, + // have the other 2 LAST reference frames shifted as follows: + // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME + // , and then have LAST_FRAME refreshed by the newly coded frame. + // + // To fulfill it, the decoder will be notified to execute following 2 steps: + // + // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME + // to point to the newly coded frame, i.e. + // ref_frame_map[lst_fb_idexes[2]] => new_fb_idx; + // + // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the + // original virtual index of LAST3_FRAME and have the other mappings + // shifted as follows: + // LAST_FRAME, LAST2_FRAME, LAST3_FRAME + // | | | + // v v v + // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1] + int ref_frame; + + if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) { + // We have swapped the virtual indices to use ALT0 as BWD_REF + // and we need to swap them back. + int tmp = cpi->arf_map[0]; + cpi->arf_map[0] = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->bwd_fb_idx; + cpi->bwd_fb_idx = tmp; + } + + if (cm->frame_type == KEY_FRAME) { + for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], + cm->new_fb_idx); + + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]], + new_uidx); + } + } else { + int tmp; + + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], + cm->new_fb_idx); + + if (use_upsampled_ref) + uref_cnt_fb( + cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], + new_uidx); + + tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + + shift_last_ref_frames(cpi); + cpi->lst_fb_idxes[0] = tmp; + + assert(cm->show_existing_frame == 0); + // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are to + // refresh the LAST_FRAME. + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } +#else + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); + if (use_upsampled_ref) + uref_cnt_fb(cpi->upsampled_ref_bufs, + &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx); + if (!cpi->rc.is_src_frame_alt_ref) { + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } +#endif // CONFIG_EXT_REFS + } + +#if DUMP_REF_FRAME_IMAGES == 1 + // Dump out all reference frame images. + dump_ref_frame_images(cpi); +#endif // DUMP_REF_FRAME_IMAGES +} + +static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + struct loopfilter *lf = &cm->lf; + if (is_lossless_requested(&cpi->oxcf)) { + lf->filter_level = 0; + } else { + struct aom_usec_timer timer; + + aom_clear_system_state(); + + aom_usec_timer_start(&timer); + + av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick); + + aom_usec_timer_mark(&timer); + cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); + } + + if (lf->filter_level > 0) { +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); +#else + if (cpi->num_workers > 1) + av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, cpi->workers, + cpi->num_workers, &cpi->lf_row_sync); + else + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); +#endif + } +#if CONFIG_CDEF + if (is_lossless_requested(&cpi->oxcf)) { + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + } else { + // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v + av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd); + + // Apply the filter + av1_cdef_frame(cm->frame_to_show, cm, xd); + } +#endif +#if CONFIG_LOOP_RESTORATION + av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL); + } +#endif // CONFIG_LOOP_RESTORATION + aom_extend_frame_inner_borders(cm->frame_to_show); +} + +static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + aom_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +void av1_scale_references(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = { + AOM_LAST_FLAG, +#if CONFIG_EXT_REFS + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, +#endif // CONFIG_EXT_REFS + AOM_GOLD_FLAG, +#if CONFIG_EXT_REFS + AOM_BWD_FLAG, +#endif // CONFIG_EXT_REFS + AOM_ALT_FLAG + }; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_buffer(cpi, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + continue; + } + +#if CONFIG_HIGHBITDEPTH + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + RefCntBuffer *new_fb_ptr = NULL; + int force_scaling = 0; + int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; + if (new_fb == INVALID_IDX) { + new_fb = get_free_fb(cm); + force_scaling = 1; + } + if (new_fb == INVALID_IDX) return; + new_fb_ptr = &pool->frame_bufs[new_fb]; + if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || + new_fb_ptr->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer( + &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, + cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE, + (int)cm->bit_depth); + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } +#else + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + RefCntBuffer *new_fb_ptr = NULL; + int force_scaling = 0; + int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; + if (new_fb == INVALID_IDX) { + new_fb = get_free_fb(cm); + force_scaling = 1; + } + if (new_fb == INVALID_IDX) return; + new_fb_ptr = &pool->frame_bufs[new_fb]; + if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || + new_fb_ptr->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + AOM_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE); + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } +#endif // CONFIG_HIGHBITDEPTH + + if (cpi->sf.use_upsampled_references && + (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || + new_fb_ptr->buf.y_crop_height != cm->height)) { + const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); + EncRefCntBuffer *ubuf = + &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]]; + + if (aom_realloc_frame_buffer(&ubuf->buf, (cm->width << 3), + (cm->height << 3), cm->subsampling_x, + cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + (AOM_BORDER_IN_PIXELS << 3), + cm->byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate up-sampled frame buffer"); +#if CONFIG_HIGHBITDEPTH + scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1, + (int)cm->bit_depth); +#else + scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1); +#endif + } + } else { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_idx[ref_frame - 1] = buf_idx; + ++buf->ref_count; + } + } else { + if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + } + } +} + +static void release_scaled_references(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int i; + if (cpi->oxcf.pass == 0) { + // Only release scaled references under certain conditions: + // if reference will be updated, or if scaled reference has same resolution. + int refresh[INTER_REFS_PER_FRAME]; + refresh[0] = (cpi->refresh_last_frame) ? 1 : 0; +#if CONFIG_EXT_REFS + refresh[1] = refresh[2] = 0; + refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0; + refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0; + refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0; +#else + refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0; + refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; +#endif // CONFIG_EXT_REFS + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const int idx = cpi->scaled_ref_idx[i - 1]; + RefCntBuffer *const buf = + idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (buf != NULL && + (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height))) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } + } + } else { + for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) { + const int idx = cpi->scaled_ref_idx[i]; + RefCntBuffer *const buf = + idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; + if (buf != NULL) { + --buf->ref_count; + cpi->scaled_ref_idx[i] = INVALID_IDX; + } + } + } +} + +static void full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN]; +} + +void av1_full_to_model_counts(av1_coeff_count_model *model_count, + av1_coeff_count *full_count) { + int i, j, k, l; + + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); +} + +#if 0 && CONFIG_INTERNAL_STATS +static void output_frame_level_debug_stats(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); + int64_t recon_err; + + aom_clear_system_state(); + + recon_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); + + if (cpi->twopass.total_left_stats.coded_error != 0.0) + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" + "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " + "%10"PRId64" %10"PRId64" %10d " + "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" + "%6d %6d %5d %5d %5d " + "%10"PRId64" %10.3lf" + "%10lf %8u %10"PRId64" %10d %10d %10d\n", + cpi->common.current_video_frame, + cm->width, cm->height, + cpi->rc.source_alt_ref_pending, + cpi->rc.source_alt_ref_active, + cpi->rc.this_frame_target, + cpi->rc.projected_frame_size, + cpi->rc.projected_frame_size / cpi->common.MBs, + (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), + cpi->rc.vbr_bits_off_target, + cpi->rc.vbr_bits_off_target_fast, + cpi->twopass.extend_minq, + cpi->twopass.extend_minq_fast, + cpi->rc.total_target_vs_actual, + (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), + cpi->rc.total_actual_bits, cm->base_qindex, + av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth), + (double)av1_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0, + av1_convert_qindex_to_q(cpi->twopass.active_worst_quality, + cm->bit_depth), + cpi->rc.avg_q, + av1_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth), + cpi->refresh_last_frame, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, + cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + cpi->twopass.bits_left / + (1 + cpi->twopass.total_left_stats.coded_error), + cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, + cpi->twopass.kf_zeromotion_pct, + cpi->twopass.fr_content_type); + + fclose(f); + + if (0) { + FILE *const fmodes = fopen("Modes.stt", "a"); + int i; + + fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); + + for (i = 0; i < MAX_MODES; ++i) + fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); + + fprintf(fmodes, "\n"); + + fclose(fmodes); + } +} +#endif + +static void set_mv_search_params(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const unsigned int max_mv_def = AOMMIN(cm->width, cm->height); + + // Default based on max resolution. + cpi->mv_step_param = av1_init_search_range(max_mv_def); + + if (cpi->sf.mv.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + cpi->mv_step_param = av1_init_search_range( + AOMMIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + } + cpi->max_mv_magnitude = 0; + } + } +} + +static void set_size_independent_vars(AV1_COMP *cpi) { +#if CONFIG_GLOBAL_MOTION + int i; + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + set_default_warp_params(&cpi->common.global_motion[i]); + } + cpi->global_motion_search_done = 0; +#endif // CONFIG_GLOBAL_MOTION + av1_set_speed_features_framesize_independent(cpi); + av1_set_rd_speed_thresholds(cpi); + av1_set_rd_speed_thresholds_sub8x8(cpi); + cpi->common.interp_filter = cpi->sf.default_interp_filter; +} + +static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + // Setup variables that depend on the dimensions of the frame. + av1_set_speed_features_framesize_dependent(cpi); + +// Decide q and q bounds. +#if CONFIG_XIPHRC + int frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME; + *q = od_enc_rc_select_quantizers_and_lambdas( + &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, + frame_type, bottom_index, top_index); +#else + *q = av1_rc_pick_q_and_bounds(cpi, bottom_index, top_index); +#endif + + if (!frame_is_intra_only(cm)) { + av1_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); + } + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (oxcf->pass == 2 && cpi->sf.static_segmentation) + configure_static_seg_features(cpi); +} + +static void init_motion_estimation(AV1_COMP *cpi) { + int y_stride = cpi->scaled_source.y_stride; + + if (cpi->sf.mv.search_method == NSTEP) { + av1_init3smotion_compensation(&cpi->ss_cfg, y_stride); + } else if (cpi->sf.mv.search_method == DIAMOND) { + av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + } +} + +#if CONFIG_LOOP_RESTORATION +static void set_restoration_tilesize(int width, int height, + RestorationInfo *rst) { + (void)width; + (void)height; + rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1); + rst[1].restoration_tilesize = rst[0].restoration_tilesize; + rst[2].restoration_tilesize = rst[0].restoration_tilesize; +} +#endif // CONFIG_LOOP_RESTORATION + +static void set_frame_size(AV1_COMP *cpi) { + int ref_frame; + AV1_COMMON *const cm = &cpi->common; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + if (oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR && + ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || + (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { + av1_calculate_coded_size(cpi, &oxcf->scaled_frame_width, + &oxcf->scaled_frame_height); + + // There has been a change in frame size. + av1_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } + + if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && + oxcf->resize_mode == RESIZE_DYNAMIC) { + if (cpi->resize_pending == 1) { + oxcf->scaled_frame_width = + (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den; + } else if (cpi->resize_pending == -1) { + // Go back up to original size. + oxcf->scaled_frame_width = oxcf->width; + oxcf->scaled_frame_height = oxcf->height; + } + if (cpi->resize_pending != 0) { + // There has been a change in frame size. + av1_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + } + } + +#if !CONFIG_XIPHRC + if (oxcf->pass == 2) { + av1_set_target_rate(cpi); + } +#endif + + alloc_frame_mvs(cm, cm->new_fb_idx); + + // Reset the frame pointers to the current frame size. + if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, + NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + +#if CONFIG_LOOP_RESTORATION + set_restoration_tilesize(cm->width, cm->height, cm->rst_info); + for (int i = 0; i < MAX_MB_PLANE; ++i) + cm->rst_info[i].frame_restoration_type = RESTORE_NONE; + av1_alloc_restoration_buffers(cm); + for (int i = 0; i < MAX_MB_PLANE; ++i) { + cpi->rst_search[i].restoration_tilesize = + cm->rst_info[i].restoration_tilesize; + av1_alloc_restoration_struct(cm, &cpi->rst_search[i], cm->width, + cm->height); + } +#endif // CONFIG_LOOP_RESTORATION + alloc_util_frame_buffers(cpi); + init_motion_estimation(cpi); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME]; + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + + ref_buf->idx = buf_idx; + + if (buf_idx != INVALID_IDX) { + YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf; + ref_buf->buf = buf; +#if CONFIG_HIGHBITDEPTH + av1_setup_scale_factors_for_frame( + &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, + cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0); +#else + av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width, + buf->y_crop_height, cm->width, + cm->height); +#endif // CONFIG_HIGHBITDEPTH + if (av1_is_scaled(&ref_buf->sf)) aom_extend_frame_borders(buf); + } else { + ref_buf->buf = NULL; + } + } + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); +} + +static void reset_use_upsampled_references(AV1_COMP *cpi) { + MV_REFERENCE_FRAME ref_frame; + + // reset up-sampled reference buffer structure. + init_upsampled_ref_frame_bufs(cpi); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, ref_frame); + int new_uidx = upsample_ref_frame(cpi, ref); + + // Update the up-sampled reference index. + cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] = new_uidx; + cpi->upsampled_ref_bufs[new_uidx].ref_count++; + } +} + +static void encode_without_recode_loop(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const int use_upsampled_ref = cpi->sf.use_upsampled_references; + + aom_clear_system_state(); + + set_frame_size(cpi); + + // For 1 pass CBR under dynamic resize mode: use faster scaling for source. + // Only for 2x2 scaling for now. + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR && + cpi->oxcf.resize_mode == RESIZE_DYNAMIC && + cpi->un_scaled_source->y_width == (cm->width << 1) && + cpi->un_scaled_source->y_height == (cm->height << 1)) { + cpi->source = av1_scale_if_required_fast(cm, cpi->un_scaled_source, + &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) + cpi->last_source = av1_scale_if_required_fast( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source); + } else { + cpi->source = + av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + } + + if (frame_is_intra_only(cm) == 0) { + av1_scale_references(cpi); + } + + set_size_independent_vars(cpi); + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + + // cpi->sf.use_upsampled_references can be different from frame to frame. + // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1. + // The reference frames for this frame have to be up-sampled before encoding. + if (!use_upsampled_ref && cpi->sf.use_upsampled_references && + cm->frame_type != KEY_FRAME) + reset_use_upsampled_references(cpi); + + av1_set_quantizer(cm, q); + av1_set_variance_partition_thresholds(cpi, q); + + setup_frame(cpi); + +#if CONFIG_SUBFRAME_PROB_UPDATE + cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1; + av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); + av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs); + cm->coef_probs_update_idx = 0; + av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs); +#endif // CONFIG_SUBFRAME_PROB_UPDATE + + suppress_active_map(cpi); + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + av1_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + av1_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_setup(cpi); + } + apply_active_map(cpi); + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + + // Update some stats from cyclic refresh, and check if we should not update + // golden reference, for 1 pass CBR. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && + (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR)) + av1_cyclic_refresh_check_golden_update(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + aom_clear_system_state(); +} + +static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int bottom_index, top_index; + int loop_count = 0; + int loop_at_this_size = 0; + int loop = 0; +#if !CONFIG_XIPHRC + int overshoot_seen = 0; + int undershoot_seen = 0; +#endif + int frame_over_shoot_limit; + int frame_under_shoot_limit; + int q = 0, q_low = 0, q_high = 0; + const int use_upsampled_ref = cpi->sf.use_upsampled_references; + + set_size_independent_vars(cpi); + + do { + aom_clear_system_state(); + + set_frame_size(cpi); + + if (loop_count == 0 || cpi->resize_pending != 0) { + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + + // cpi->sf.use_upsampled_references can be different from frame to frame. + // Every time when cpi->sf.use_upsampled_references is changed from 0 to + // 1. + // The reference frames for this frame have to be up-sampled before + // encoding. + if (!use_upsampled_ref && cpi->sf.use_upsampled_references && + cm->frame_type != KEY_FRAME) + reset_use_upsampled_references(cpi); + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + set_mv_search_params(cpi); + +#if !CONFIG_XIPHRC + // Reset the loop state for new frame size. + overshoot_seen = 0; + undershoot_seen = 0; +#endif + + // Reconfiguration for change in frame size has concluded. + cpi->resize_pending = 0; + + q_low = bottom_index; + q_high = top_index; + + loop_at_this_size = 0; + } + + // Decide frame size bounds first time through. + if (loop_count == 0) { + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + } + + cpi->source = + av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source); + + if (cpi->unscaled_last_source != NULL) + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + + if (frame_is_intra_only(cm) == 0) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + av1_scale_references(cpi); + } + + av1_set_quantizer(cm, q); + + if (loop_count == 0) setup_frame(cpi); + +#if CONFIG_Q_ADAPT_PROBS + // Base q-index may have changed, so we need to assign proper default coef + // probs before every iteration. + if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + int i; + av1_default_coef_probs(cm); + if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || + cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) { + for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc; + } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) { + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + } + } +#endif // CONFIG_Q_ADAPT_PROBS + +#if CONFIG_SUBFRAME_PROB_UPDATE + cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1; + if (loop_count == 0 || frame_is_intra_only(cm) || + cm->error_resilient_mode) { + av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); + av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs); + } else { + if (cm->do_subframe_update) { + av1_copy(cm->fc->coef_probs, + cpi->subframe_stats.enc_starting_coef_probs); + av1_copy(cm->starting_coef_probs, + cpi->subframe_stats.enc_starting_coef_probs); + av1_zero(cpi->subframe_stats.coef_counts_buf); + av1_zero(cpi->subframe_stats.eob_counts_buf); + } + } + cm->coef_probs_update_idx = 0; + av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs); +#endif // CONFIG_SUBFRAME_PROB_UPDATE + + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + av1_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + av1_setup_in_frame_q_adj(cpi); + } + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + + aom_clear_system_state(); + + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { + save_coding_context(cpi); + + av1_pack_bitstream(cpi, dest, size); + + rc->projected_frame_size = (int)(*size) << 3; + restore_coding_context(cpi); + + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + } + + if (cpi->oxcf.rc_mode == AOM_Q) { + loop = 0; + } else { + if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = q; + int64_t kf_err; + + int64_t high_err_target = cpi->ambient_err; + int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); + } else { + kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); + } +#else + kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); +#endif // CONFIG_HIGHBITDEPTH + + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + q_high = q > q_low ? q - 1 : q_low; + + // Adjust Q + q = (int)((q * high_err_target) / kf_err); + q = AOMMIN(q, (q_high + q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + q_low = q < q_high ? q + 1 : q_high; + + // Adjust Q + q = (int)((q * low_err_target) / kf_err); + q = AOMMIN(q, (q_high + q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else if (recode_loop_test(cpi, frame_over_shoot_limit, + frame_under_shoot_limit, q, + AOMMAX(q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = q; +#if !CONFIG_XIPHRC + int retries = 0; +#endif + + if (cpi->resize_pending == 1) { + // Change in frame size so go back around the recode loop. + cpi->rc.frame_size_selector = + SCALE_STEP1 - cpi->rc.frame_size_selector; + cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + ++loop_count; + loop = 1; + continue; + } + +#if !CONFIG_XIPHRC + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. + if (rc->projected_frame_size >= rc->max_frame_bandwidth) + q_high = rc->worst_quality; + + // Raise Qlow as to at least the current value + q_low = q < q_high ? q + 1 : q_high; + + if (undershoot_seen || loop_at_this_size > 1) { + // Update rate_correction_factor unless + av1_rc_update_rate_correction_factors(cpi); + + q = (q_high + q_low + 1) / 2; + } else { + // Update rate_correction_factor unless + av1_rc_update_rate_correction_factors(cpi); + + q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index)); + + while (q < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi); + q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index)); + retries++; + } + } + + overshoot_seen = 1; + } else { + // Frame is too small + q_high = q > q_low ? q - 1 : q_low; + + if (overshoot_seen || loop_at_this_size > 1) { + av1_rc_update_rate_correction_factors(cpi); + q = (q_high + q_low) / 2; + } else { + av1_rc_update_rate_correction_factors(cpi); + q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index); + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) { + q_low = q; + } + + while (q > q_high && retries < 10) { + av1_rc_update_rate_correction_factors(cpi); + q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index); + retries++; + } + } + + undershoot_seen = 1; + } +#endif + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = (q != last_q); + } else { + loop = 0; + } + } + + // Special case for overlay frame. + if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) + loop = 0; + +#if CONFIG_GLOBAL_MOTION + if (recode_loop_test_global_motion(cpi)) { + loop = 1; + } +#endif // CONFIG_GLOBAL_MOTION + + if (loop) { + ++loop_count; + ++loop_at_this_size; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + } + } while (loop); +} + +static int get_ref_frame_flags(const AV1_COMP *cpi) { + const int *const map = cpi->common.ref_frame_map; + +#if CONFIG_EXT_REFS + const int last2_is_last = + map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]]; + const int last3_is_last = + map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]]; + const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]]; +#if CONFIG_LOWDELAY_COMPOUND + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; + const int last3_is_last2 = + map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; + const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; + const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; +#else + const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; + + const int last3_is_last2 = + map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; + const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; + const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]]; + + const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; + const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]]; + + const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx]; + +#endif + const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx]; + const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx]; + const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx]; +#else + const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; + const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; +#endif // CONFIG_EXT_REFS + + int flags = AOM_REFFRAME_ALL; + +#if CONFIG_EXT_REFS + // Disable the use of BWDREF_FRAME for non-bipredictive frames. + if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame || + (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs))) + flags &= ~AOM_BWD_FLAG; +#endif // CONFIG_EXT_REFS + + if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG; + + if (alt_is_last) flags &= ~AOM_ALT_FLAG; + +#if CONFIG_EXT_REFS + if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG; + + if (last3_is_last || last3_is_last2 || last3_is_alt) flags &= ~AOM_LAST3_FLAG; + + if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG; + +#if CONFIG_LOWDELAY_COMPOUND // Changes LL & HL bitstream + /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */ + if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; +#else + if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld || + bwd_is_alt) && + (flags & AOM_BWD_FLAG)) + flags &= ~AOM_BWD_FLAG; +#endif +#endif // CONFIG_EXT_REFS + + return flags; +} + +static void set_ext_overrides(AV1_COMP *cpi) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to encode_frame_to_data_rate() function + if (cpi->ext_refresh_frame_context_pending) { + cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; + cpi->ext_refresh_frame_context_pending = 0; + } + if (cpi->ext_refresh_frame_flags_pending) { + cpi->refresh_last_frame = cpi->ext_refresh_last_frame; + cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; + cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + cpi->ext_refresh_frame_flags_pending = 0; + } +} + +YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { + // For 2x2 scaling down. + aom_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1, 2, 1, 0); + aom_extend_frame_borders(scaled); + return scaled; + } else { + return unscaled; + } +} + +YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_HIGHBITDEPTH + scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); +#else + scale_and_extend_frame_nonnormative(unscaled, scaled); +#endif // CONFIG_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + +static void set_arf_sign_bias(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int arf_sign_bias; +#if CONFIG_EXT_REFS + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + // The arf_sign_bias will be one for internal ARFs' + arf_sign_bias = cpi->rc.source_alt_ref_active && + (!cpi->refresh_alt_ref_frame || + (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); +#else + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_sign_bias = cpi->rc.source_alt_ref_active && + (!cpi->refresh_alt_ref_frame || + (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); + } else { + arf_sign_bias = + (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + } +#endif // CONFIG_EXT_REFS + + cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; +#if CONFIG_EXT_REFS + cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME]; +#endif // CONFIG_EXT_REFS +} + +static int setup_interp_filter_search_mask(AV1_COMP *cpi) { + InterpFilter ifilter; + int ref_total[TOTAL_REFS_PER_FRAME] = { 0 }; + MV_REFERENCE_FRAME ref; + int mask = 0; + int arf_idx = ALTREF_FRAME; + +#if CONFIG_EXT_REFS + // Get which arf used as ALTREF_FRAME + if (cpi->oxcf.pass == 2) + arf_idx += cpi->twopass.gf_group.arf_ref_idx[cpi->twopass.gf_group.index]; +#endif // CONFIG_EXT_REFS + + if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) + return mask; + +#if CONFIG_EXT_REFS + for (ref = LAST_FRAME; ref < ALTREF_FRAME; ++ref) + for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; + + for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[arf_idx][ifilter]; +#else + for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) + for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; +#endif // CONFIG_EXT_REFS + + for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) { + if ((ref_total[LAST_FRAME] && + cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && +#if CONFIG_EXT_REFS + (ref_total[LAST2_FRAME] == 0 || + cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50 < + ref_total[LAST2_FRAME]) && + (ref_total[LAST3_FRAME] == 0 || + cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50 < + ref_total[LAST3_FRAME]) && +#endif // CONFIG_EXT_REFS + (ref_total[GOLDEN_FRAME] == 0 || + cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 < + ref_total[GOLDEN_FRAME]) && +#if CONFIG_EXT_REFS + (ref_total[BWDREF_FRAME] == 0 || + cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 < + ref_total[BWDREF_FRAME]) && +#endif // CONFIG_EXT_REFS + (ref_total[ALTREF_FRAME] == 0 || + cpi->interp_filter_selected[arf_idx][ifilter] * 50 < + ref_total[ALTREF_FRAME])) + mask |= 1 << ifilter; + } + return mask; +} + +#define DUMP_RECON_FRAMES 0 + +#if DUMP_RECON_FRAMES == 1 +// NOTE(zoeliu): For debug - Output the filtered reconstructed video. +static void dump_filtered_recon_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show; + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + + if (recon_buf == NULL || !cm->show_frame) { + printf("Frame %d is not ready or no show to dump.\n", + cm->current_video_frame); + return; + } + + if (cm->current_video_frame == 0) { + if ((f_recon = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return; + } + } else { + if ((f_recon = fopen(file_name, "ab")) == NULL) { + printf("Unable to open file %s to append.\n", file_name); + return; + } + } + printf( + "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, " + "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n", + cm->current_video_frame, cpi->twopass.gf_group.index, + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], + cm->show_existing_frame, recon_buf->y_stride, recon_buf->uv_stride, + cm->width, cm->height); + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, + f_recon); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + + fclose(f_recon); +} +#endif // DUMP_RECON_FRAMES + +#if CONFIG_EC_ADAPT + +static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows, + const int tile_cols, + FRAME_CONTEXT *ec_ctxs[]) { + int i; + for (i = 0; i < tile_rows * tile_cols; ++i) + ec_ctxs[i] = &cpi->tile_data[i].tctx; +} + +#endif +static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest, int skip_adapt, + unsigned int *frame_flags) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + TX_SIZE t; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols * + sizeof(&cpi->tile_data[0].tctx)); + aom_cdf_prob **cdf_ptrs = + aom_malloc(cm->tile_rows * cm->tile_cols * + sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0])); +#endif +#if CONFIG_XIPHRC + int frame_type; + int drop_this_frame = 0; +#endif // CONFIG_XIPHRC + set_ext_overrides(cpi); + aom_clear_system_state(); + + // Set the arf sign bias for this frame. + set_arf_sign_bias(cpi); +#if CONFIG_TEMPMV_SIGNALING + // frame type has been decided outside of this function call + cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only; + cm->use_prev_frame_mvs = + !cpi->oxcf.disable_tempmv && !cm->cur_frame->intra_only; +#endif + +#if CONFIG_EXT_REFS + // NOTE: + // (1) Move the setup of the ref_frame_flags upfront as it would be + // determined by the current frame properties; + // (2) The setup of the ref_frame_flags applies to both show_existing_frame's + // and the other cases. + if (cm->current_video_frame > 0) + cpi->ref_frame_flags = get_ref_frame_flags(cpi); + + if (cm->show_existing_frame) { + // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current + // BWDREF_FRAME in the reference frame buffer. + cm->frame_type = INTER_FRAME; + cm->show_frame = 1; + cpi->frame_flags = *frame_flags; + + // In the case of show_existing frame, we will not send fresh flag + // to decoder. Any change in the reference frame buffer can be done by + // switching the virtual indices. + + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + cpi->rc.is_bwd_ref_frame = 0; + cpi->rc.is_last_bipred_frame = 0; + cpi->rc.is_bipred_frame = 0; + + // Build the bitstream + av1_pack_bitstream(cpi, dest, size); + + // Set up frame to show to get ready for stats collection. + cm->frame_to_show = get_frame_new_buffer(cm); + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + // Update the LAST_FRAME in the reference frame buffer. + av1_update_reference_frames(cpi); + + // Update frame flags + cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; + cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; + cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; + + *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; + + // Update the frame type + cm->last_frame_type = cm->frame_type; + + // Since we allocate a spot for the OVERLAY frame in the gf group, we need + // to do post-encoding update accordingly. + if (cpi->rc.is_src_frame_alt_ref) { + av1_set_target_rate(cpi); +#if CONFIG_XIPHRC + frame_type = cm->frame_type == INTER_FRAME ? OD_P_FRAME : OD_I_FRAME; + drop_this_frame = od_enc_rc_update_state( + &cpi->od_rc, *size << 3, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, frame_type, cpi->droppable); +#else + av1_rc_postencode_update(cpi, *size); +#endif + } + + cm->last_width = cm->width; + cm->last_height = cm->height; + + ++cm->current_video_frame; + +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif + return; + } +#endif // CONFIG_EXT_REFS + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm)) { + // Reset the loop filter deltas and segmentation map. + av1_reset_segment_features(cm); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->rc.source_alt_ref_active = 0; + + cm->error_resilient_mode = oxcf->error_resilient_mode; + + // By default, encoder assumes decoder can use prev_mi. + if (cm->error_resilient_mode) { + cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; + cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD; + } else if (cm->intra_only) { + // Only reset the current context. + cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT; + } + } +#if CONFIG_TILE_GROUPS + if (cpi->oxcf.mtu == 0) { + cm->num_tg = cpi->oxcf.num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cm->num_tg = DEFAULT_MAX_NUM_TG; + } +#endif + +#if CONFIG_EXT_TILE + cm->tile_encoding_mode = cpi->oxcf.tile_encoding_mode; +#endif // CONFIG_EXT_TILE + +#if CONFIG_XIPHRC + if (drop_this_frame) { + av1_rc_postencode_update_drop_frame(cpi); + ++cm->current_video_frame; +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif + return; + } +#else + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame. + if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && + cm->frame_type != KEY_FRAME) { + if (av1_rc_drop_frame(cpi)) { + av1_rc_postencode_update_drop_frame(cpi); + ++cm->current_video_frame; +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif + return; + } + } +#endif + + aom_clear_system_state(); + +#if CONFIG_INTERNAL_STATS + memset(cpi->mode_chosen_counts, 0, + MAX_MODES * sizeof(*cpi->mode_chosen_counts)); +#endif + +#if CONFIG_REFERENCE_BUFFER + { + /* Non-normative definition of current_frame_id ("frame counter" with + * wraparound) */ + const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7; + if (cm->current_frame_id == -1) { + int lsb, msb; +/* quasi-random initialization of current_frame_id for a key frame */ +#if CONFIG_HIGHBITDEPTH + if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { + lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; + msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; + } else { +#endif + lsb = cpi->source->y_buffer[0] & 0xff; + msb = cpi->source->y_buffer[1] & 0xff; +#if CONFIG_HIGHBITDEPTH + } +#endif + cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length); + } else { + cm->current_frame_id = + (cm->current_frame_id + 1 + (1 << frame_id_length)) % + (1 << frame_id_length); + } + } +#endif + +#if CONFIG_EXT_DELTA_Q + cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; + cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; +#endif + + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + encode_without_recode_loop(cpi); + } else { + encode_with_recode_loop(cpi, size, dest); + } + +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + av1_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif // OUTPUT_YUV_SKINMAP + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + cpi->ambient_err = + aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm)); + } else { + cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); + } +#else + cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm)); +#endif // CONFIG_HIGHBITDEPTH + } + + // If the encoder forced a KEY_FRAME decision + if (cm->frame_type == KEY_FRAME) { + cpi->refresh_last_frame = 1; + } + + cm->frame_to_show = get_frame_new_buffer(cm); + cm->frame_to_show->color_space = cm->color_space; + cm->frame_to_show->color_range = cm->color_range; + cm->frame_to_show->render_width = cm->render_width; + cm->frame_to_show->render_height = cm->render_height; + +#if CONFIG_EXT_REFS +// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned +// off. +#endif // CONFIG_EXT_REFS + + // Pick the loop filter level for the frame. + loopfilter_frame(cpi, cm); + + // Build the bitstream + av1_pack_bitstream(cpi, dest, size); + + if (skip_adapt) { +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif + return; + } + +#if CONFIG_REFERENCE_BUFFER + { + int i; + /* Update reference frame id values based on the value of refresh_mask */ + for (i = 0; i < REF_FRAMES; i++) { + if ((cm->refresh_mask >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + } + } + } +#endif + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + if (cm->show_frame) dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + if (cm->seg.update_map) update_reference_segmentation_map(cpi); + + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + + av1_update_reference_frames(cpi); + + for (t = 0; t < TX_SIZES; t++) + av1_full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); +#if CONFIG_ENTROPY_STATS + av1_accumulate_frame_counts(&aggregate_fc, &cm->counts); +#endif // CONFIG_ENTROPY_STATS + if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { +#if CONFIG_SUBFRAME_PROB_UPDATE + cm->partial_prob_update = 0; +#endif // CONFIG_SUBFRAME_PROB_UPDATE + av1_adapt_coef_probs(cm); + av1_adapt_intra_frame_probs(cm); +#if CONFIG_EC_ADAPT + make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs); + av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, + cm->tile_rows * cm->tile_cols); + av1_average_tile_intra_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, + cm->tile_rows * cm->tile_cols); +#if CONFIG_PVQ + av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs, + cm->tile_rows * cm->tile_cols); +#endif // CONFIG_PVQ +#endif // CONFIG_EC_ADAPT +#if CONFIG_ADAPT_SCAN + av1_adapt_scan_order(cm); +#endif // CONFIG_ADAPT_SCAN + } + + if (!frame_is_intra_only(cm)) { + if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + av1_adapt_inter_frame_probs(cm); + av1_adapt_mv_probs(cm, cm->allow_high_precision_mv); +#if CONFIG_EC_ADAPT + av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs, + cdf_ptrs, cm->tile_rows * cm->tile_cols); + av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, + cm->tile_rows * cm->tile_cols); +#endif + } + } + + if (cpi->refresh_golden_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_GOLDEN; + else + cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; + + if (cpi->refresh_alt_ref_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_ALTREF; + else + cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; + +#if CONFIG_EXT_REFS + if (cpi->refresh_bwd_ref_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_BWDREF; + else + cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; +#endif // CONFIG_EXT_REFS + +#if !CONFIG_EXT_REFS + cpi->ref_frame_flags = get_ref_frame_flags(cpi); +#endif // !CONFIG_EXT_REFS + + cm->last_frame_type = cm->frame_type; + +#if CONFIG_XIPHRC + frame_type = cm->frame_type == KEY_FRAME ? OD_I_FRAME : OD_P_FRAME; + + drop_this_frame = + od_enc_rc_update_state(&cpi->od_rc, *size << 3, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, frame_type, 0); + if (drop_this_frame) { + av1_rc_postencode_update_drop_frame(cpi); + ++cm->current_video_frame; +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif + return; + } +#else // !CONFIG_XIPHRC + av1_rc_postencode_update(cpi, *size); +#endif // CONFIG_XIPHRC + +#if 0 + output_frame_level_debug_stats(cpi); +#endif + + if (cm->frame_type == KEY_FRAME) { + // Tell the caller that the frame was coded as a key frame + *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY; + } else { + *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; + } + + // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; + + // keep track of the last coded dimensions + cm->last_width = cm->width; + cm->last_height = cm->height; + + // reset to normal state now that we are done. + if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + + if (cm->show_frame) { +#if CONFIG_EXT_REFS +// TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are +// being used as reference. +#endif // CONFIG_EXT_REFS + av1_swap_mi_and_prev_mi(cm); + // Don't increment frame counters if this was an altref buffer + // update not a real frame + ++cm->current_video_frame; + } + +#if CONFIG_EXT_REFS + // NOTE: Shall not refer to any frame not used as reference. + if (cm->is_reference_frame) +#endif // CONFIG_EXT_REFS + cm->prev_frame = cm->cur_frame; +#if CONFIG_EC_ADAPT + aom_free(tile_ctxs); + aom_free(cdf_ptrs); +#endif +} + +static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + int skip_adapt, unsigned int *frame_flags) { +#if CONFIG_XIPHRC + int64_t ip_count; + int frame_type, is_golden, is_altref; + + /* Not updated during init so update it here */ + if (cpi->oxcf.rc_mode == AOM_Q) cpi->od_rc.quality = cpi->oxcf.cq_level; + + frame_type = od_frame_type(&cpi->od_rc, cpi->od_rc.cur_frame, &is_golden, + &is_altref, &ip_count); + + if (frame_type == OD_I_FRAME) { + frame_type = KEY_FRAME; + cpi->frame_flags &= FRAMEFLAGS_KEY; + } else if (frame_type == OD_P_FRAME) { + frame_type = INTER_FRAME; + } + + if (is_altref) { + cpi->refresh_alt_ref_frame = 1; + cpi->rc.source_alt_ref_active = 1; + } + + cpi->refresh_golden_frame = is_golden; + cpi->common.frame_type = frame_type; + if (is_golden) cpi->frame_flags &= FRAMEFLAGS_GOLDEN; +#else + if (cpi->oxcf.rc_mode == AOM_CBR) { + av1_rc_get_one_pass_cbr_params(cpi); + } else { + av1_rc_get_one_pass_vbr_params(cpi); + } +#endif + encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags); +} + +#if !CONFIG_XIPHRC +static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { + encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags); + +#if CONFIG_EXT_REFS + // Do not do post-encoding update for those frames that do not have a spot in + // a gf group, but note that an OVERLAY frame always has a spot in a gf group, + // even when show_existing_frame is used. + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) { + av1_twopass_postencode_update(cpi); + } + check_show_existing_frame(cpi); +#else + av1_twopass_postencode_update(cpi); +#endif // CONFIG_EXT_REFS +} +#endif + +static void init_ref_frame_bufs(AV1_COMMON *cm) { + int i; + BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = INVALID_IDX; + pool->frame_bufs[i].ref_count = 0; + } +} + +static void check_initial_width(AV1_COMP *cpi, +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth, +#endif + int subsampling_x, int subsampling_y) { + AV1_COMMON *const cm = &cpi->common; + + if (!cpi->initial_width || +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth != use_highbitdepth || +#endif + cm->subsampling_x != subsampling_x || + cm->subsampling_y != subsampling_y) { + cm->subsampling_x = subsampling_x; + cm->subsampling_y = subsampling_y; +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth = use_highbitdepth; +#endif + + alloc_raw_frame_buffers(cpi); + init_ref_frame_bufs(cm); + alloc_util_frame_buffers(cpi); + + init_motion_estimation(cpi); // TODO(agrange) This can be removed. + + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + cpi->initial_mbs = cm->MBs; + } +} + +int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + AV1_COMMON *const cm = &cpi->common; + struct aom_usec_timer timer; + int res = 0; + const int subsampling_x = sd->subsampling_x; + const int subsampling_y = sd->subsampling_y; +#if CONFIG_HIGHBITDEPTH + const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; +#endif + +#if CONFIG_HIGHBITDEPTH + check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#else + check_initial_width(cpi, subsampling_x, subsampling_y); +#endif // CONFIG_HIGHBITDEPTH + + aom_usec_timer_start(&timer); + + if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, +#if CONFIG_HIGHBITDEPTH + use_highbitdepth, +#endif // CONFIG_HIGHBITDEPTH + frame_flags)) + res = -1; + aom_usec_timer_mark(&timer); + cpi->time_receive_data += aom_usec_timer_elapsed(&timer); + + if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + (subsampling_x != 1 || subsampling_y != 1)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 3"); + res = -1; + } + if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && + (subsampling_x == 1 && subsampling_y == 1)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "4:2:0 color format requires profile 0 or 2"); + res = -1; + } + + return res; +} + +static int frame_is_reference(const AV1_COMP *cpi) { + const AV1_COMMON *cm = &cpi->common; + + return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame || +#endif // CONFIG_EXT_REFS + cpi->refresh_alt_ref_frame || !cm->error_resilient_mode || + cm->lf.mode_ref_delta_update || cm->seg.update_map || + cm->seg.update_data; +} + +static void adjust_frame_rate(AV1_COMP *cpi, + const struct lookahead_entry *source) { + int64_t this_duration; + int step = 0; + + if (source->ts_start == cpi->first_time_stamp_ever) { + this_duration = source->ts_end - source->ts_start; + step = 1; + } else { + int64_t last_duration = + cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + + this_duration = source->ts_end - cpi->last_end_time_stamp_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + av1_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = AOMMIN( + (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + + av1_new_framerate(cpi, 10000000.0 / avg_duration); + } + } + cpi->last_time_stamp_seen = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_end; +} + +// Returns 0 if this is not an alt ref else the offset of the source frame +// used as the arf midpoint. +static int get_arf_src_index(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int arf_src_index = 0; + if (is_altref_enabled(cpi)) { + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } else if (rc->source_alt_ref_pending) { + arf_src_index = rc->frames_till_gf_update_due; + } + } + return arf_src_index; +} + +#if CONFIG_EXT_REFS +static int get_brf_src_index(AV1_COMP *cpi) { + int brf_src_index = 0; + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup + // flag. + if (gf_group->bidir_pred_enabled[gf_group->index]) { + if (cpi->oxcf.pass == 2) { + if (gf_group->update_type[gf_group->index] == BRF_UPDATE) + brf_src_index = gf_group->brf_src_offset[gf_group->index]; + } else { + // TODO(zoeliu): To re-visit the setup for this scenario + brf_src_index = cpi->rc.bipred_group_interval - 1; + } + } + + return brf_src_index; +} +#endif // CONFIG_EXT_REFS + +static void check_src_altref(AV1_COMP *cpi, + const struct lookahead_entry *source) { + RATE_CONTROL *const rc = &cpi->rc; + + // If pass == 2, the parameters set here will be reset in + // av1_rc_get_second_pass_params() + + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + rc->is_src_frame_alt_ref = +#if CONFIG_EXT_REFS + (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) || +#endif // CONFIG_EXT_REFS + (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); + } else { + rc->is_src_frame_alt_ref = + cpi->alt_ref_source && (source == cpi->alt_ref_source); + } + + if (rc->is_src_frame_alt_ref) { + // Current frame is an ARF overlay frame. + cpi->alt_ref_source = NULL; + + // Don't refresh the last buffer for an ARF overlay frame. It will + // become the GF so preserve last as an alternative prediction option. + cpi->refresh_last_frame = 0; + } +} + +#if CONFIG_INTERNAL_STATS +extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, + int width, int height); + +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { + s->stat[Y] += y; + s->stat[U] += u; + s->stat[V] += v; + s->stat[ALL] += all; + s->worst = AOMMIN(s->worst, all); +} + +static void compute_internal_stats(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + double samples = 0.0; + uint32_t in_bit_depth = 8; + uint32_t bit_depth = 8; + +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + in_bit_depth = cpi->oxcf.input_bit_depth; + bit_depth = cm->bit_depth; + } +#endif + if (cm->show_frame) { + const YV12_BUFFER_CONFIG *orig = cpi->source; + const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + double y, u, v, frame_all; + + cpi->count++; + if (cpi->b_calculate_psnr) { + PSNR_STATS psnr; + double frame_ssim2 = 0.0, weight = 0.0; + aom_clear_system_state(); +// TODO(yaowu): unify these two versions into one. +#if CONFIG_HIGHBITDEPTH + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif // CONFIG_HIGHBITDEPTH + + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], + &cpi->psnr); + cpi->total_sq_error += psnr.sse[0]; + cpi->total_samples += psnr.samples[0]; + samples = psnr.samples[0]; +// TODO(yaowu): unify these two versions into one. +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + frame_ssim2 = + aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); + else + frame_ssim2 = aom_calc_ssim(orig, recon, &weight); +#else + frame_ssim2 = aom_calc_ssim(orig, recon, &weight); +#endif // CONFIG_HIGHBITDEPTH + + cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2); + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cpi->common.current_video_frame, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + if (cpi->b_calculate_blockiness) { +#if CONFIG_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + const double frame_blockiness = + av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, orig->y_width, orig->y_height); + cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness); + cpi->total_blockiness += frame_blockiness; + } + + if (cpi->b_calculate_consistency) { +#if CONFIG_HIGHBITDEPTH + if (!cm->use_highbitdepth) +#endif + { + const double this_inconsistency = aom_get_ssim_metrics( + orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, + orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); + + const double peak = (double)((1 << in_bit_depth) - 1); + const double consistency = + aom_sse_to_psnr(samples, peak, cpi->total_inconsistency); + if (consistency > 0.0) + cpi->worst_consistency = + AOMMIN(cpi->worst_consistency, consistency); + cpi->total_inconsistency += this_inconsistency; + } + } + } + + frame_all = + aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + } +} +#endif // CONFIG_INTERNAL_STATS + +int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + RATE_CONTROL *const rc = &cpi->rc; + struct aom_usec_timer cmptimer; + YV12_BUFFER_CONFIG *force_src_buffer = NULL; + struct lookahead_entry *last_source = NULL; + struct lookahead_entry *source = NULL; + int arf_src_index; +#if CONFIG_EXT_REFS + int brf_src_index; +#endif // CONFIG_EXT_REFS + int i; + +#if CONFIG_XIPHRC + cpi->od_rc.end_of_input = flush; +#endif + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); + bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); +#endif + + aom_usec_timer_start(&cmptimer); + + av1_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); + + // Is multi-arf enabled. + // Note that at the moment multi_arf is only configured for 2 pass VBR + if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1)) + cpi->multi_arf_allowed = 1; + else + cpi->multi_arf_allowed = 0; + + // Normal defaults + cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; + cm->refresh_frame_context = + (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) + ? REFRESH_FRAME_CONTEXT_FORWARD + : REFRESH_FRAME_CONTEXT_BACKWARD; + + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 0; +#endif // CONFIG_EXT_REFS + cpi->refresh_alt_ref_frame = 0; + +#if CONFIG_EXT_REFS && !CONFIG_XIPHRC + if (oxcf->pass == 2 && cm->show_existing_frame) { + // Manage the source buffer and flush out the source frame that has been + // coded already; Also get prepared for PSNR calculation if needed. + if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) { + *size = 0; + return -1; + } + cpi->source = &source->img; + // TODO(zoeliu): To track down to determine whether it's needed to adjust + // the frame rate. + *time_stamp = source->ts_start; + *time_end = source->ts_end; + + // We need to adjust frame rate for an overlay frame + if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source); + + // Find a free buffer for the new frame, releasing the reference previously + // held. + if (cm->new_fb_idx != INVALID_IDX) { + --pool->frame_bufs[cm->new_fb_idx].ref_count; + } + cm->new_fb_idx = get_free_fb(cm); + + if (cm->new_fb_idx == INVALID_IDX) return -1; + + // Clear down mmx registers + aom_clear_system_state(); + + // Start with a 0 size frame. + *size = 0; + + // We need to update the gf_group for show_existing overlay frame + if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi); + + Pass2Encode(cpi, size, dest, frame_flags); + + if (cpi->b_calculate_psnr) generate_psnr_packet(cpi); + +#if CONFIG_INTERNAL_STATS + compute_internal_stats(cpi); + cpi->bytes += (int)(*size); +#endif // CONFIG_INTERNAL_STATS + + // Clear down mmx registers + aom_clear_system_state(); + + cm->show_existing_frame = 0; + return 0; + } +#endif // CONFIG_EXT_REFS && !CONFIG_XIPHRC + + // Should we encode an arf frame. + arf_src_index = get_arf_src_index(cpi); + if (arf_src_index) { + for (i = 0; i <= arf_src_index; ++i) { + struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i); + // Avoid creating an alt-ref if there's a forced keyframe pending. + if (e == NULL) { + break; + } else if (e->flags == AOM_EFLAG_FORCE_KF) { + arf_src_index = 0; + flush = 1; + break; + } + } + } + + if (arf_src_index) { + assert(arf_src_index <= rc->frames_to_key); + + if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { + cpi->alt_ref_source = source; + + if (oxcf->arnr_max_frames > 0) { + // Produce the filtered ARF frame. + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer); + force_src_buffer = &cpi->alt_ref_buffer; + } + + cm->show_frame = 0; + cm->intra_only = 0; + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; + rc->is_src_frame_alt_ref = 0; + } + rc->source_alt_ref_pending = 0; + } + +#if CONFIG_EXT_REFS + rc->is_bwd_ref_frame = 0; + brf_src_index = get_brf_src_index(cpi); + if (brf_src_index) { + assert(brf_src_index <= rc->frames_to_key); + if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) { + cm->show_frame = 0; + cm->intra_only = 0; + + cpi->refresh_bwd_ref_frame = 1; + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + rc->is_bwd_ref_frame = 1; + } + } +#endif // CONFIG_EXT_REFS + + if (!source) { + // Get last frame source. + if (cm->current_video_frame > 0) { + if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL) + return -1; + } + + // Read in the source frame. + source = av1_lookahead_pop(cpi->lookahead, flush); + + if (source != NULL) { + cm->show_frame = 1; + cm->intra_only = 0; + + // Check to see if the frame should be encoded as an arf overlay. + check_src_altref(cpi, source); + } + } + + if (source) { + cpi->un_scaled_source = cpi->source = + force_src_buffer ? force_src_buffer : &source->img; + + cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + + } else { + *size = 0; + if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { +#if CONFIG_XIPHRC + od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 1); +#else + av1_end_first_pass(cpi); /* get last stats packet */ +#endif + cpi->twopass.first_pass_done = 1; + } + return -1; + } + + if (source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_start; + } + + // Clear down mmx registers + aom_clear_system_state(); + + // adjust frame rates based on timestamps given + if (cm->show_frame) adjust_frame_rate(cpi, source); + + // Find a free buffer for the new frame, releasing the reference previously + // held. + if (cm->new_fb_idx != INVALID_IDX) { + --pool->frame_bufs[cm->new_fb_idx].ref_count; + } + cm->new_fb_idx = get_free_fb(cm); + + if (cm->new_fb_idx == INVALID_IDX) return -1; + + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; + +#if CONFIG_EXT_REFS + if (oxcf->pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cpi->alt_fb_idx = cpi->arf_map[gf_group->arf_ref_idx[gf_group->index]]; + } +#else + if (cpi->multi_arf_allowed) { + if (cm->frame_type == KEY_FRAME) { + init_buffer_indices(cpi); + } else if (oxcf->pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; + } + } +#endif // CONFIG_EXT_REFS + + // Start with a 0 size frame. + *size = 0; + + cpi->frame_flags = *frame_flags; + + if (oxcf->pass == 2) { +#if CONFIG_XIPHRC + if (od_enc_rc_2pass_in(&cpi->od_rc) < 0) return -1; + } +#else + av1_rc_get_second_pass_params(cpi); + } else if (oxcf->pass == 1) { + set_frame_size(cpi); + } +#endif + + if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) { + for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) + cpi->scaled_ref_idx[i] = INVALID_IDX; + } + +#if CONFIG_AOM_QM + cm->using_qmatrix = cpi->oxcf.using_qm; + cm->min_qmlevel = cpi->oxcf.qm_minlevel; + cm->max_qmlevel = cpi->oxcf.qm_maxlevel; +#endif + +#if CONFIG_REFERENCE_BUFFER + if (*time_stamp == 0) { + cpi->common.current_frame_id = -1; + } +#endif + +#if CONFIG_XIPHRC + if (oxcf->pass == 1) { + size_t tmp; + if (cpi->od_rc.cur_frame == 0) Pass0Encode(cpi, &tmp, dest, 1, frame_flags); + cpi->od_rc.firstpass_quant = cpi->od_rc.target_quantizer; + Pass0Encode(cpi, &tmp, dest, 0, frame_flags); + od_enc_rc_2pass_out(&cpi->od_rc, cpi->output_pkt_list, 0); + } else if (oxcf->pass == 2) { + Pass0Encode(cpi, size, dest, 0, frame_flags); + } else { + if (cpi->od_rc.cur_frame == 0) { + size_t tmp; + Pass0Encode(cpi, &tmp, dest, 1, frame_flags); + } + Pass0Encode(cpi, size, dest, 0, frame_flags); + } +#else + if (oxcf->pass == 1) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf); + av1_first_pass(cpi, source); + } else if (oxcf->pass == 2) { + Pass2Encode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, 0, frame_flags); + } +#endif + + if (!cm->error_resilient_mode) + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + + // No frame encoded, or frame was dropped, release scaled references. + if ((*size == 0) && (frame_is_intra_only(cm) == 0)) { + release_scaled_references(cpi); + } + + if (*size > 0) { + cpi->droppable = !frame_is_reference(cpi); + } + + aom_usec_timer_mark(&cmptimer); + cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); + + if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame) + generate_psnr_packet(cpi); + +#if CONFIG_INTERNAL_STATS + if (oxcf->pass != 1) { + compute_internal_stats(cpi); + cpi->bytes += (int)(*size); + } +#endif // CONFIG_INTERNAL_STATS + +#if CONFIG_XIPHRC + cpi->od_rc.cur_frame++; +#endif + + aom_clear_system_state(); + + return 0; +} + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { + AV1_COMMON *cm = &cpi->common; + if (!cm->show_frame) { + return -1; + } else { + int ret; + if (cm->frame_to_show) { + *dest = *cm->frame_to_show; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->subsampling_x; + dest->uv_height = cm->height >> cm->subsampling_y; + ret = 0; + } else { + ret = -1; + } + aom_clear_system_state(); + return ret; + } +} + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { + if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1; + + *frame = + cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf; + return 0; +} + +int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, + AOM_SCALING vert_mode) { + AV1_COMMON *cm = &cpi->common; + int hr = 0, hs = 0, vr = 0, vs = 0; + + if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; + cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + + update_frame_size(cpi); + + return 0; +} + +int av1_set_size_literal(AV1_COMP *cpi, unsigned int width, + unsigned int height) { + AV1_COMMON *cm = &cpi->common; +#if CONFIG_HIGHBITDEPTH + check_initial_width(cpi, cm->use_highbitdepth, 1, 1); +#else + check_initial_width(cpi, 1, 1); +#endif // CONFIG_HIGHBITDEPTH + + if (width) { + cm->width = width; + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d\n", cm->width); + } + } + + if (height) { + cm->height = height; + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d\n", cm->height); + } + } + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + + update_frame_size(cpi); + + return 0; +} + +int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; } + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + if (flags & + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST) { + ref ^= AOM_LAST_FLAG; +#if CONFIG_EXT_REFS + ref ^= AOM_LAST2_FLAG; + ref ^= AOM_LAST3_FLAG; +#endif // CONFIG_EXT_REFS + } + + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG; + + av1_use_as_reference(cpi, ref); + } + + if (flags & + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) { + int upd = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_UPD_LAST) { + upd ^= AOM_LAST_FLAG; +#if CONFIG_EXT_REFS + upd ^= AOM_LAST2_FLAG; + upd ^= AOM_LAST3_FLAG; +#endif // CONFIG_EXT_REFS + } + + if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_ARF) upd ^= AOM_ALT_FLAG; + + av1_update_reference(cpi, upd); + } + + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { + av1_update_entropy(cpi, 0); + } +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h new file mode 100644 index 000000000..4e7aef8fc --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.h @@ -0,0 +1,883 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_ENCODER_H_ +#define AV1_ENCODER_ENCODER_H_ + +#include + +#include "./aom_config.h" +#include "aom/aomcx.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/entropymode.h" +#include "av1/common/thread_common.h" +#include "av1/common/onyxc_int.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#if CONFIG_ANS +#include "aom_dsp/ans.h" +#include "aom_dsp/buf_ans.h" +#endif +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/mbgraph.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/variance_tree.h" +#if CONFIG_XIPHRC +#include "av1/encoder/ratectrl_xiph.h" +#endif + +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_dsp/variance.h" +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int nmvjointcost[MV_JOINTS]; + int nmvcosts[2][MV_VALS]; + int nmvcosts_hp[2][MV_VALS]; + +#if CONFIG_REF_MV + int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; + int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; + int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; +#endif + + // 0 = Intra, Last, GF, ARF + signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME]; + // 0 = ZERO_MV, MV + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT fc; +} CODING_CONTEXT; + +typedef enum { + // regular inter frame + REGULAR_FRAME = 0, + // alternate reference frame + ARF_FRAME = 1, + // overlay frame + OVERLAY_FRAME = 2, + // golden frame + GLD_FRAME = 3, +#if CONFIG_EXT_REFS + // backward reference frame + BRF_FRAME = 4, + // extra alternate reference frame + EXT_ARF_FRAME = 5 +#endif +} FRAME_CONTEXT_INDEX; + +typedef enum { + NORMAL = 0, + FOURFIVE = 1, + THREEFIVE = 2, + ONETWO = 3 +} AOM_SCALING; + +typedef enum { + // Good Quality Fast Encoding. The encoder balances quality with the amount of + // time it takes to encode the output. Speed setting controls how fast. + GOOD +} MODE; + +typedef enum { + FRAMEFLAGS_KEY = 1 << 0, + FRAMEFLAGS_GOLDEN = 1 << 1, +#if CONFIG_EXT_REFS + FRAMEFLAGS_BWDREF = 1 << 2, + FRAMEFLAGS_ALTREF = 1 << 3, +#else + FRAMEFLAGS_ALTREF = 1 << 2, +#endif // CONFIG_EXT_REFS +} FRAMETYPE_FLAGS; + +typedef enum { + NO_AQ = 0, + VARIANCE_AQ = 1, + COMPLEXITY_AQ = 2, + CYCLIC_REFRESH_AQ = 3, +#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q + DELTA_AQ = 4, +#endif + AQ_MODE_COUNT // This should always be the last member of the enum +} AQ_MODE; +#if CONFIG_EXT_DELTA_Q +typedef enum { + NO_DELTA_Q = 0, + DELTA_Q_ONLY = 1, + DELTA_Q_LF = 2, + DELTAQ_MODE_COUNT // This should always be the last member of the enum +} DELTAQ_MODE; +#endif +typedef enum { + RESIZE_NONE = 0, // No frame resizing allowed. + RESIZE_FIXED = 1, // All frames are coded at the specified dimension. + RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. +} RESIZE_TYPE; + +typedef struct AV1EncoderConfig { + BITSTREAM_PROFILE profile; + aom_bit_depth_t bit_depth; // Codec bit-depth. + int width; // width of data passed to the compressor + int height; // height of data passed to the compressor + unsigned int input_bit_depth; // Input bit depth. + double init_framerate; // set to passed in framerate + int64_t target_bandwidth; // bandwidth to be used in bits per second + + int noise_sensitivity; // pre processing blur: recommendation 0 + int sharpness; // sharpening output: recommendation 0: + int speed; + // maximum allowed bitrate for any intra frame in % of bitrate target. + unsigned int rc_max_intra_bitrate_pct; + // maximum allowed bitrate for any inter frame in % of bitrate target. + unsigned int rc_max_inter_bitrate_pct; + // percent of rate boost for golden frame in CBR mode. + unsigned int gf_cbr_boost_pct; + + MODE mode; + int pass; + + // Key Framing Operations + int auto_key; // autodetect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. + + int lag_in_frames; // how many frames lag before we start encoding + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + enum aom_rc_mode rc_mode; + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; + + // Frame drop threshold. + int drop_frames_water_mark; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + AQ_MODE aq_mode; // Adaptive Quantization mode +#if CONFIG_EXT_DELTA_Q + DELTAQ_MODE deltaq_mode; +#endif +#if CONFIG_AOM_QM + int using_qm; + int qm_minlevel; + int qm_maxlevel; +#endif +#if CONFIG_TILE_GROUPS + unsigned int num_tile_groups; + unsigned int mtu; +#endif + +#if CONFIG_TEMPMV_SIGNALING + unsigned int disable_tempmv; +#endif + // Internal frame size scaling. + RESIZE_TYPE resize_mode; + int scaled_frame_width; + int scaled_frame_height; + + // Enable feature to reduce the frame quantization every x frames. + int frame_periodic_boost; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + int enable_auto_arf; +#if CONFIG_EXT_REFS + int enable_auto_brf; // (b)ackward (r)ef (f)rame +#endif // CONFIG_EXT_REFS + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. + */ + unsigned int frame_parallel_decoding_mode; + + int arnr_max_frames; + int arnr_strength; + + int min_gf_interval; + int max_gf_interval; + + int tile_columns; + int tile_rows; +#if CONFIG_DEPENDENT_HORZTILES + int dependent_horz_tiles; +#endif +#if CONFIG_LOOPFILTERING_ACROSS_TILES + int loop_filter_across_tiles_enabled; +#endif // CONFIG_LOOPFILTERING_ACROSS_TILES + + int max_threads; + + aom_fixed_buf_t two_pass_stats_in; + struct aom_codec_pkt_list *output_pkt_list; + +#if CONFIG_FP_MB_STATS + aom_fixed_buf_t firstpass_mb_stats_in; +#endif + + aom_tune_metric tuning; + aom_tune_content content; +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth; +#endif + aom_color_space_t color_space; + int color_range; + int render_width; + int render_height; + +#if CONFIG_EXT_PARTITION + aom_superblock_size_t superblock_size; +#endif // CONFIG_EXT_PARTITION +#if CONFIG_ANS && ANS_MAX_SYMBOLS + int ans_window_size_log2; +#endif // CONFIG_ANS && ANS_MAX_SYMBOLS +#if CONFIG_EXT_TILE + unsigned int tile_encoding_mode; +#endif // CONFIG_EXT_TILE + + unsigned int motion_vector_unit_test; +} AV1EncoderConfig; + +static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. +typedef struct TileDataEnc { + TileInfo tile_info; + int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; + int mode_map[BLOCK_SIZES][MAX_MODES]; + int m_search_count; + int ex_search_count; +#if CONFIG_PVQ + PVQ_QUEUE pvq_q; +#endif +#if CONFIG_CFL + CFL_CTX cfl; +#endif +#if CONFIG_EC_ADAPT + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); +#endif +} TileDataEnc; + +typedef struct RD_COUNTS { + av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; + int64_t comp_pred_diff[REFERENCE_MODES]; +#if CONFIG_GLOBAL_MOTION + // Stores number of 4x4 blocks using global motion per reference frame. + int global_motion_used[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_GLOBAL_MOTION +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + + PICK_MODE_CONTEXT *leaf_tree; + PC_TREE *pc_tree; + PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; + + VAR_TREE *var_tree; + VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; +} ThreadData; + +struct EncWorkerData; + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL + +typedef struct IMAGE_STAT { + double stat[NUM_STAT_TYPES]; + double worst; +} ImageStat; + +#undef NUM_STAT_TYPES + +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; + +#if CONFIG_SUBFRAME_PROB_UPDATE +typedef struct SUBFRAME_STATS { + av1_coeff_probs_model coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; + av1_coeff_count coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; + unsigned int eob_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES][REF_TYPES] + [COEF_BANDS][COEFF_CONTEXTS]; + av1_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES]; +} SUBFRAME_STATS; +#endif // CONFIG_SUBFRAME_PROB_UPDATE + +typedef struct TileBufferEnc { + uint8_t *data; + size_t size; +} TileBufferEnc; + +typedef struct AV1_COMP { + QUANTS quants; + ThreadData td; + MB_MODE_INFO_EXT *mbmi_ext_base; + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width +#if CONFIG_NEW_QUANT + DECLARE_ALIGNED(16, dequant_val_type_nuq, + y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); + DECLARE_ALIGNED(16, dequant_val_type_nuq, + uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); +#endif // CONFIG_NEW_QUANT + AV1_COMMON common; + AV1EncoderConfig oxcf; + struct lookahead_ctx *lookahead; + struct lookahead_entry *alt_ref_source; + + YV12_BUFFER_CONFIG *source; + YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames + YV12_BUFFER_CONFIG *un_scaled_source; + YV12_BUFFER_CONFIG scaled_source; + YV12_BUFFER_CONFIG *unscaled_last_source; + YV12_BUFFER_CONFIG scaled_last_source; + + // Up-sampled reference buffers + // NOTE(zoeliu): It is needed to allocate sufficient space to the up-sampled + // reference buffers, which should include the up-sampled version of all the + // possibly stored references plus the currently coded frame itself. + EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1]; + int upsampled_ref_idx[REF_FRAMES + 1]; + + // For a still frame, this flag is set to 1 to skip partition search. + int partition_search_skippable_frame; + + int scaled_ref_idx[TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_REFS + int lst_fb_idxes[LAST_REF_FRAMES]; +#else + int lst_fb_idx; +#endif // CONFIG_EXT_REFS + int gld_fb_idx; +#if CONFIG_EXT_REFS + int bwd_fb_idx; // BWD_REF_FRAME +#endif // CONFIG_EXT_REFS + int alt_fb_idx; + + int last_show_frame_buf_idx; // last show frame buffer index + + int refresh_last_frame; + int refresh_golden_frame; +#if CONFIG_EXT_REFS + int refresh_bwd_ref_frame; +#endif // CONFIG_EXT_REFS + int refresh_alt_ref_frame; + + int ext_refresh_frame_flags_pending; + int ext_refresh_last_frame; + int ext_refresh_golden_frame; + int ext_refresh_alt_ref_frame; + + int ext_refresh_frame_context_pending; + int ext_refresh_frame_context; + + YV12_BUFFER_CONFIG last_frame_uf; +#if CONFIG_LOOP_RESTORATION + YV12_BUFFER_CONFIG last_frame_db; + YV12_BUFFER_CONFIG trial_frame_rst; + uint8_t *extra_rstbuf; // Extra buffers used in restoration search + RestorationInfo rst_search[MAX_MB_PLANE]; // Used for encoder side search +#endif // CONFIG_LOOP_RESTORATION + + // Ambient reconstruction err target for force key frames + int64_t ambient_err; + + RD_OPT rd; + + CODING_CONTEXT coding_context; + +#if CONFIG_REF_MV + int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; + int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; +#endif + + int nmvcosts[2][MV_VALS]; + int nmvcosts_hp[2][MV_VALS]; + int nmvsadcosts[2][MV_VALS]; + int nmvsadcosts_hp[2][MV_VALS]; + + int64_t last_time_stamp_seen; + int64_t last_end_time_stamp_seen; + int64_t first_time_stamp_ever; + + RATE_CONTROL rc; +#if CONFIG_XIPHRC + od_rc_state od_rc; +#endif + double framerate; + + // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter + // references; Plus the currently coded frame itself, it is needed to allocate + // sufficient space to the size of the maximum possible number of frames. + int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE]; + + struct aom_codec_pkt_list *output_pkt_list; + + MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; + int mbgraph_n_frames; // number of frames filled in the above + int static_mb_pct; // % forced skip mbs by segmentation + int ref_frame_flags; + + SPEED_FEATURES sf; + + unsigned int max_mv_magnitude; + int mv_step_param; + + int allow_comp_inter_inter; + + uint8_t *segmentation_map; + + CYCLIC_REFRESH *cyclic_refresh; + ActiveMap active_map; + + fractional_mv_step_fp *find_fractional_mv_step; + av1_full_search_fn_t full_search_sad; // It is currently unused. + av1_diamond_search_fn_t diamond_search_sad; + aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; + uint64_t time_receive_data; + uint64_t time_compress_data; + uint64_t time_pick_lpf; + uint64_t time_encode_sb_row; + +#if CONFIG_FP_MB_STATS + int use_fp_mb_stats; +#endif + + TWO_PASS twopass; + + YV12_BUFFER_CONFIG alt_ref_buffer; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; + + int count; + uint64_t total_sq_error; + uint64_t total_samples; + ImageStat psnr; + + double total_blockiness; + double worst_blockiness; + + int bytes; + double summed_quality; + double summed_weights; + unsigned int tot_recode_hits; + double worst_ssim; + + ImageStat fastssim; + ImageStat psnrhvs; + + int b_calculate_blockiness; + int b_calculate_consistency; + + double total_inconsistency; + double worst_consistency; + Ssimv *ssim_vars; + Metrics metrics; +#endif + int b_calculate_psnr; + + int droppable; + + int initial_width; + int initial_height; + int initial_mbs; // Number of MBs in the full-size frame; to be used to + // normalize the firstpass stats. This will differ from the + // number of MBs in the current frame when the frame is + // scaled. + + // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. + DIFF *source_diff_var; + // The threshold used in SOURCE_VAR_BASED_PARTITION search type. + unsigned int source_var_thresh; + int frames_till_next_var_check; + + int frame_flags; + + search_site_config ss_cfg; + + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; +#if CONFIG_REF_MV + int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; + int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2]; + int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; + int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; +#endif + + unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; +#if CONFIG_EXT_INTER + unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS] + [INTER_COMPOUND_MODES]; + unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int motion_mode_cost[BLOCK_SIZES][MOTION_MODES]; +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + int motion_mode_cost1[BLOCK_SIZES][2]; +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; +#if CONFIG_EXT_PARTITION_TYPES + int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] + [EXT_PARTITION_TYPES]; +#else + int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX] + [PARTITION_TYPES]; +#endif +#if CONFIG_PALETTE + int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; + int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES]; + int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; +#endif // CONFIG_PALETTE + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; +#if CONFIG_EXT_TX + int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; +#else + int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; + int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES]; +#endif // CONFIG_EXT_TX +#if CONFIG_EXT_INTRA +#if CONFIG_INTRA_INTERP + int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS]; +#endif // CONFIG_INTRA_INTERP +#endif // CONFIG_EXT_INTRA +#if CONFIG_LOOP_RESTORATION + int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; +#endif // CONFIG_LOOP_RESTORATION +#if CONFIG_GLOBAL_MOTION + int gmtype_cost[TRANS_TYPES]; + int gmparams_cost[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_GLOBAL_MOTION + + int multi_arf_allowed; + int multi_arf_enabled; + int multi_arf_last_grp_enabled; + + TileDataEnc *tile_data; + int allocated_tiles; // Keep track of memory allocated for tiles. + + TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; + unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS]; + + TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + + int resize_pending; + int resize_state; + int resize_scale_num; + int resize_scale_den; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; + + // VAR_BASED_PARTITION thresholds + // 0 - threshold_128x128; + // 1 - threshold_64x64; + // 2 - threshold_32x32; + // 3 - threshold_16x16; + // 4 - threshold_8x8; + int64_t vbp_thresholds[5]; + int64_t vbp_threshold_minmax; + int64_t vbp_threshold_sad; + BLOCK_SIZE vbp_bsize_min; + + // VARIANCE_AQ segment map refresh + int vaq_refresh; + + // Multi-threading + int num_workers; + AVxWorker *workers; + struct EncWorkerData *tile_thr_data; + AV1LfSync lf_row_sync; +#if CONFIG_SUBFRAME_PROB_UPDATE + SUBFRAME_STATS subframe_stats; + // TODO(yaowu): minimize the size of count buffers + SUBFRAME_STATS wholeframe_stats; + av1_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; +#endif // CONFIG_SUBFRAME_PROB_UPDATE +#if CONFIG_ANS + struct BufAnsCoder buf_ans; +#endif +#if CONFIG_EXT_REFS + int refresh_frame_mask; + int existing_fb_idx_to_show; + int is_arf_filter_off[MAX_EXT_ARFS + 1]; + int num_extra_arfs; + int arf_map[MAX_EXT_ARFS + 1]; +#endif // CONFIG_EXT_REFS +#if CONFIG_GLOBAL_MOTION + int global_motion_search_done; +#endif +#if CONFIG_REFERENCE_BUFFER + SequenceHeader seq_params; +#endif +#if CONFIG_LV_MAP + tran_low_t *tcoeff_buf[MAX_MB_PLANE]; +#endif +} AV1_COMP; + +void av1_initialize_enc(void); + +struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, + BufferPool *const pool); +void av1_remove_compressor(AV1_COMP *cpi); + +void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf); + +// receive a frames worth of data. caller can assume that a copy of this +// frame is made and not just a copy of the pointer.. +int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time_stamp); + +int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush); + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); + +int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags); + +void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags); + +int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int av1_update_entropy(AV1_COMP *cpi, int update); + +int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); + +int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); + +int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, + AOM_SCALING vert_mode); + +int av1_set_size_literal(AV1_COMP *cpi, unsigned int width, + unsigned int height); + +int av1_get_quantizer(struct AV1_COMP *cpi); + +void av1_full_to_model_counts(av1_coeff_count_model *model_count, + av1_coeff_count *full_count); + +static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { + return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); +} + +static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { +#if CONFIG_EXT_REFS + if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME) + return cpi->lst_fb_idxes[ref_frame - 1]; +#else + if (ref_frame == LAST_FRAME) return cpi->lst_fb_idx; +#endif // CONFIG_EXT_REFS + else if (ref_frame == GOLDEN_FRAME) + return cpi->gld_fb_idx; +#if CONFIG_EXT_REFS + else if (ref_frame == BWDREF_FRAME) + return cpi->bwd_fb_idx; +#endif // CONFIG_EXT_REFS + else + return cpi->alt_fb_idx; +} + +static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { + const AV1_COMMON *const cm = &cpi->common; + const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( + const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { + const AV1_COMMON *const cm = &cpi->common; + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf + : NULL; +} + +static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref( + const AV1_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) { + // Use up-sampled reference frames. + const int buf_idx = + cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)]; + return &cpi->upsampled_ref_bufs[buf_idx].buf; +} + +#if CONFIG_EXT_REFS +static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { + MV_REFERENCE_FRAME ref_frame; + AV1_COMMON *const cm = &cpi->common; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + if (buf_idx == INVALID_IDX) continue; + if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break; + } + return (ref_frame <= ALTREF_FRAME); +} +#endif // CONFIG_EXT_REFS + +static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) { + // We assume 3 planes all at full resolution. We assume up to 1 token per + // pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane, + // plus EOSB_TOKEN per plane. + return mb_rows * mb_cols * (16 * 16 + 17) * 3; +} + +// Get the allocated token size for a tile. It does the same calculation as in +// the frame token allocation. +static INLINE unsigned int allocated_tokens(TileInfo tile) { +#if CONFIG_CB4X4 + int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2; + int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2; +#else + int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1; + int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1; +#endif + + return get_token_alloc(tile_mb_rows, tile_mb_cols); +} + +void av1_alloc_compressor_data(AV1_COMP *cpi); + +void av1_scale_references(AV1_COMP *cpi); + +void av1_update_reference_frames(AV1_COMP *cpi); + +void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv); +#if CONFIG_TEMPMV_SIGNALING +void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction); +#endif + +YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled); + +YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled); + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); + +static INLINE int is_altref_enabled(const AV1_COMP *const cpi) { + return cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.enable_auto_arf; +} + +// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf +#if 0 && CONFIG_EXT_REFS +static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) { + // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of + // alt_ref, and now will be off when the alt_ref interval is + // not sufficiently large. + return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf; +} +#endif // CONFIG_EXT_REFS + +static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, + MV_REFERENCE_FRAME ref0, + MV_REFERENCE_FRAME ref1) { + xd->block_refs[0] = + &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0]; + xd->block_refs[1] = + &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0]; +} + +static INLINE int get_chessboard_index(int frame_index) { + return frame_index & 0x1; +} + +static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { + return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; +} + +void av1_new_framerate(AV1_COMP *cpi, double framerate); + +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + +// Update up-sampled reference frame index. +static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, + int new_uidx) { + const int ref_index = *uidx; + + if (ref_index >= 0 && ubufs[ref_index].ref_count > 0) + ubufs[ref_index].ref_count--; + + *uidx = new_uidx; + ubufs[new_uidx].ref_count++; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_ENCODER_H_ diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c new file mode 100644 index 000000000..3f71a4472 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/scan.h" +#include "av1/common/blockd.h" +#include "av1/common/idct.h" +#include "av1/common/pred_common.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/subexp.h" +#include "av1/encoder/tokenize.h" + +void av1_alloc_txb_buf(AV1_COMP *cpi) { +#if 0 + AV1_COMMON *cm = &cpi->common; + int mi_block_size = 1 << MI_SIZE_LOG2; + // TODO(angiebird): Make sure cm->subsampling_x/y is set correctly, and then + // use precise buffer size according to cm->subsampling_x/y + int pixel_stride = mi_block_size * cm->mi_cols; + int pixel_height = mi_block_size * cm->mi_rows; + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) { + CHECK_MEM_ERROR( + cm, cpi->tcoeff_buf[i], + aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height)); + } +#else + (void)cpi; +#endif +} + +void av1_free_txb_buf(AV1_COMP *cpi) { +#if 0 + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) { + aom_free(cpi->tcoeff_buf[i]); + } +#else + (void)cpi; +#endif +} + +static void write_golomb(aom_writer *w, int level) { + int x = level + 1; + int i = x; + int length = 0; + + while (i) { + i >>= 1; + ++length; + } + assert(length > 0); + + for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0); + + for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); +} + +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, + aom_writer *w, int block, int plane, + const tran_low_t *tcoeff, uint16_t eob, + TXB_CTX *txb_ctx) { + aom_prob *nz_map; + aom_prob *eob_flag; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = get_tx_size(plane, xd); + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const int16_t *scan = scan_order->scan; + int c; + int is_nz; + const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + const int seg_eob = tx_size_2d[tx_size]; + uint8_t txb_mask[32 * 32] = { 0 }; + uint16_t update_eob = 0; + + aom_write(w, eob == 0, cm->fc->txb_skip[tx_size][txb_ctx->txb_skip_ctx]); + + if (eob == 0) return; +#if CONFIG_TXK_SEL + av1_write_tx_type(cm, xd, block, plane, w); +#endif + + nz_map = cm->fc->nz_map[tx_size][plane_type]; + eob_flag = cm->fc->eob_flag[tx_size][plane_type]; + + for (c = 0; c < eob; ++c) { + int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl); + int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl); + + tran_low_t v = tcoeff[scan[c]]; + is_nz = (v != 0); + + if (c == seg_eob - 1) break; + + aom_write(w, is_nz, nz_map[coeff_ctx]); + + if (is_nz) { + aom_write(w, c == (eob - 1), eob_flag[eob_ctx]); + } + txb_mask[scan[c]] = 1; + } + + int i; + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + aom_prob *coeff_base = cm->fc->coeff_base[tx_size][plane_type][i]; + + update_eob = 0; + for (c = eob - 1; c >= 0; --c) { + tran_low_t v = tcoeff[scan[c]]; + tran_low_t level = abs(v); + int sign = (v < 0) ? 1 : 0; + int ctx; + + if (level <= i) continue; + + ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1); + + if (level == i + 1) { + aom_write(w, 1, coeff_base[ctx]); + if (c == 0) { + aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + } else { + aom_write_bit(w, sign); + } + continue; + } + aom_write(w, 0, coeff_base[ctx]); + update_eob = AOMMAX(update_eob, c); + } + } + + for (c = update_eob; c >= 0; --c) { + tran_low_t v = tcoeff[scan[c]]; + tran_low_t level = abs(v); + int sign = (v < 0) ? 1 : 0; + int idx; + int ctx; + + if (level <= NUM_BASE_LEVELS) continue; + + if (c == 0) { + aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]); + } else { + aom_write_bit(w, sign); + } + + // level is above 1. + ctx = get_level_ctx(tcoeff, scan[c], bwl); + for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { + if (level == (idx + 1 + NUM_BASE_LEVELS)) { + aom_write(w, 1, cm->fc->coeff_lps[tx_size][plane_type][ctx]); + break; + } + aom_write(w, 0, cm->fc->coeff_lps[tx_size][plane_type][ctx]); + } + if (idx < COEFF_BASE_RANGE) continue; + + // use 0-th order Golomb code to handle the residual level. + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } +} + +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, int plane) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + BLOCK_SIZE bsize = mbmi->sb_type; + struct macroblockd_plane *pd = &xd->plane[plane]; + +#if CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#else + const BLOCK_SIZE plane_bsize = + get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); +#endif + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + TX_SIZE tx_size = get_tx_size(plane, xd); + const int bkw = tx_size_wide_unit[tx_size]; + const int bkh = tx_size_high_unit[tx_size]; + const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size]; + int row, col; + int block = 0; + for (row = 0; row < max_blocks_high; row += bkh) { + for (col = 0; col < max_blocks_wide; col += bkw) { + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + uint16_t eob = x->mbmi_ext->eobs[plane][block]; + TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], + x->mbmi_ext->dc_sign_ctx[plane][block] }; + av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx); + block += step; + } + } +} + +static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, + int c, // raster order + const int bwl, + int ctx_set[NUM_BASE_LEVELS]) { + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = 1 << bwl; + int mag[NUM_BASE_LEVELS] = { 0 }; + int idx; + tran_low_t abs_coeff; + int i; + + for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { + int ref_row = row + base_ref_offset[idx][0]; + int ref_col = col + base_ref_offset[idx][1]; + int pos = (ref_row << bwl) + ref_col; + + if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride) + continue; + + abs_coeff = abs(tcoeffs[pos]); + + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + ctx_set[i] += abs_coeff > i; + if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) + mag[i] |= abs_coeff > (i + 1); + } + } + + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + ctx_set[i] = (ctx_set[i] + 1) >> 1; + + if (row == 0 && col == 0) + ctx_set[i] = (ctx_set[i] << 1) + mag[i]; + else if (row == 0) + ctx_set[i] = 8 + (ctx_set[i] << 1) + mag[i]; + else if (col == 0) + ctx_set[i] = 18 + (ctx_set[i] << 1) + mag[i]; + else + ctx_set[i] = 28 + (ctx_set[i] << 1) + mag[i]; + } + return; +} + +int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, + int block, TXB_CTX *txb_ctx) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const TX_SIZE tx_size = get_tx_size(plane, xd); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int c, cost; + const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); + int txb_skip_ctx = txb_ctx->txb_skip_ctx; + aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type]; + + const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + // txb_mask is only initialized for once here. After that, it will be set when + // coding zero map and then reset when coding level 1 info. + uint8_t txb_mask[32 * 32] = { 0 }; + aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] = + xd->fc->coeff_base[tx_size][plane_type]; + + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const int16_t *scan = scan_order->scan; + + cost = 0; + + if (eob == 0) { + cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 1); + return cost; + } + + cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 0); + +#if CONFIG_TXK_SEL + cost += av1_tx_type_cost(cpi, xd, mbmi->sb_type, plane, tx_size, tx_type); +#endif + + for (c = 0; c < eob; ++c) { + tran_low_t v = qcoeff[scan[c]]; + int is_nz = (v != 0); + int level = abs(v); + + if (c < seg_eob) { + int coeff_ctx = get_nz_map_ctx(qcoeff, txb_mask, scan[c], bwl); + cost += av1_cost_bit(nz_map[coeff_ctx], is_nz); + } + + if (is_nz) { + int ctx_ls[NUM_BASE_LEVELS] = { 0 }; + int sign = (v < 0) ? 1 : 0; + + // sign bit cost + if (c == 0) { + int dc_sign_ctx = txb_ctx->dc_sign_ctx; + + cost += av1_cost_bit(xd->fc->dc_sign[plane_type][dc_sign_ctx], sign); + } else { + cost += av1_cost_bit(128, sign); + } + + get_base_ctx_set(qcoeff, scan[c], bwl, ctx_ls); + + int i; + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + if (level <= i) continue; + + if (level == i + 1) { + cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 1); + continue; + } + cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 0); + } + + if (level > NUM_BASE_LEVELS) { + int idx; + int ctx; + + ctx = get_level_ctx(qcoeff, scan[c], bwl); + + for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { + if (level == (idx + 1 + NUM_BASE_LEVELS)) { + cost += + av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 1); + break; + } + cost += av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 0); + } + + if (idx >= COEFF_BASE_RANGE) { + // residual cost + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + int ri = r; + int length = 0; + + while (ri) { + ri >>= 1; + ++length; + } + + for (ri = 0; ri < length - 1; ++ri) cost += av1_cost_bit(128, 0); + + for (ri = length - 1; ri >= 0; --ri) + cost += av1_cost_bit(128, (r >> ri) & 0x01); + } + } + + if (c < seg_eob) { + int eob_ctx = get_eob_ctx(qcoeff, scan[c], bwl); + cost += av1_cost_bit(xd->fc->eob_flag[tx_size][plane_type][eob_ctx], + c == (eob - 1)); + } + } + + txb_mask[scan[c]] = 1; + } + + return cost; +} + +typedef struct TxbParams { + const AV1_COMP *cpi; + ThreadData *td; + int rate; +} TxbParams; + +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob) { + const int16_t *scan = scan_order->scan; + int cul_level = 0; + int c; + for (c = 0; c < eob; ++c) { + cul_level += abs(qcoeff[scan[c]]); + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + set_dc_sign(&cul_level, qcoeff[0]); + + return cul_level; +} + +static void update_txb_context(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + TxbParams *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const uint16_t eob = p->eobs[block]; + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + (void)plane_bsize; + + int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); + av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); +} + +static void update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + TxbParams *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int eob = p->eobs[block], update_eob = 0; + const PLANE_TYPE plane_type = pd->plane_type; + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + const int segment_id = mbmi->segment_id; + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const int16_t *scan = scan_order->scan; + const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); + int c, i; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col, + pd->left_context + blk_row, &txb_ctx); + const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + int cul_level = 0; + unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2]; + uint8_t txb_mask[32 * 32] = { 0 }; + + nz_map_count = &td->counts->nz_map[tx_size][plane_type]; + + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + + ++td->counts->txb_skip[tx_size][txb_ctx.txb_skip_ctx][eob == 0]; + x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx; + + x->mbmi_ext->eobs[plane][block] = eob; + + if (eob == 0) { + av1_set_contexts(xd, pd, plane, tx_size, 0, blk_col, blk_row); + return; + } + +#if CONFIG_TXK_SEL + av1_update_tx_type_count(cm, xd, block, plane, mbmi->sb_type, tx_size, + td->counts); +#endif + + for (c = 0; c < eob; ++c) { + tran_low_t v = qcoeff[scan[c]]; + int is_nz = (v != 0); + int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl); + int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl); + + if (c == seg_eob - 1) break; + + ++(*nz_map_count)[coeff_ctx][is_nz]; + + if (is_nz) { + ++td->counts->eob_flag[tx_size][plane_type][eob_ctx][c == (eob - 1)]; + } + txb_mask[scan[c]] = 1; + } + + // Reverse process order to handle coefficient level and sign. + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + update_eob = 0; + for (c = eob - 1; c >= 0; --c) { + tran_low_t v = qcoeff[scan[c]]; + tran_low_t level = abs(v); + int ctx; + + if (level <= i) continue; + + ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1); + + if (level == i + 1) { + ++td->counts->coeff_base[tx_size][plane_type][i][ctx][1]; + if (c == 0) { + int dc_sign_ctx = txb_ctx.dc_sign_ctx; + + ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; + x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + } + cul_level += level; + continue; + } + ++td->counts->coeff_base[tx_size][plane_type][i][ctx][0]; + update_eob = AOMMAX(update_eob, c); + } + } + + for (c = update_eob; c >= 0; --c) { + tran_low_t v = qcoeff[scan[c]]; + tran_low_t level = abs(v); + int idx; + int ctx; + + if (level <= NUM_BASE_LEVELS) continue; + + cul_level += level; + if (c == 0) { + int dc_sign_ctx = txb_ctx.dc_sign_ctx; + + ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0]; + x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + } + + // level is above 1. + ctx = get_level_ctx(tcoeff, scan[c], bwl); + for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { + if (level == (idx + 1 + NUM_BASE_LEVELS)) { + ++td->counts->coeff_lps[tx_size][plane_type][ctx][1]; + break; + } + ++td->counts->coeff_lps[tx_size][plane_type][ctx][0]; + } + if (idx < COEFF_BASE_RANGE) continue; + + // use 0-th order Golomb code to handle the residual level. + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + + // DC value + set_dc_sign(&cul_level, tcoeff[0]); + av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); + +#if CONFIG_ADAPT_SCAN + // Since dqcoeff is not available here, we pass qcoeff into + // av1_update_scan_count_facade(). The update behavior should be the same + // because av1_update_scan_count_facade() only cares if coefficients are zero + // or not. + av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, + qcoeff, eob); +#endif +} + +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int ctx = av1_get_skip_context(xd); + const int skip_inc = + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + struct TxbParams arg = { cpi, td, 0 }; + (void)rate; + (void)mi_row; + (void)mi_col; + if (mbmi->skip) { + if (!dry_run) td->counts->skip[ctx][1] += skip_inc; + reset_skip_context(xd, bsize); + return; + } + + if (!dry_run) { + td->counts->skip[ctx][0] += skip_inc; + av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, + update_and_record_txb_context, &arg); + } else if (dry_run == DRY_RUN_NORMAL) { + av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, update_txb_context, + &arg); + } else { + printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); + assert(0); + } +} + +static void find_new_prob(unsigned int *branch_cnt, aom_prob *oldp, + int *savings, int *update, aom_writer *const bc) { + const aom_prob upd = DIFF_UPDATE_PROB; + int u = 0; + aom_prob newp = get_binary_prob(branch_cnt[0], branch_cnt[1]); + int s = av1_prob_diff_update_savings_search(branch_cnt, *oldp, &newp, upd, 1); + + if (s > 0 && newp != *oldp) u = 1; + + if (u) + *savings += s - (int)(av1_cost_zero(upd)); // TODO(jingning): 1? + else + *savings -= (int)(av1_cost_zero(upd)); + + if (update) { + ++update[u]; + return; + } + + aom_write(bc, u, upd); + if (u) { + /* send/use new probability */ + av1_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } +} + +static void write_txb_probs(aom_writer *const bc, AV1_COMP *cpi, + TX_SIZE tx_size) { + FRAME_CONTEXT *fc = cpi->common.fc; + FRAME_COUNTS *counts = cpi->td.counts; + int savings = 0; + int update[2] = { 0, 0 }; + int plane, ctx, level; + + for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { + find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx], + &savings, update, bc); + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + find_new_prob(counts->nz_map[tx_size][plane][ctx], + &fc->nz_map[tx_size][plane][ctx], &savings, update, bc); + } + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { + find_new_prob(counts->eob_flag[tx_size][plane][ctx], + &fc->eob_flag[tx_size][plane][ctx], &savings, update, bc); + } + } + + for (level = 0; level < NUM_BASE_LEVELS; ++level) { + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { + find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], + &fc->coeff_base[tx_size][plane][level][ctx], &savings, + update, bc); + } + } + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + find_new_prob(counts->coeff_lps[tx_size][plane][ctx], + &fc->coeff_lps[tx_size][plane][ctx], &savings, update, bc); + } + } + + // Decide if to update the model for this tx_size + if (update[1] == 0 || savings < 0) { + aom_write_bit(bc, 0); + return; + } + aom_write_bit(bc, 1); + + for (ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) { + find_new_prob(counts->txb_skip[tx_size][ctx], &fc->txb_skip[tx_size][ctx], + &savings, NULL, bc); + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + find_new_prob(counts->nz_map[tx_size][plane][ctx], + &fc->nz_map[tx_size][plane][ctx], &savings, NULL, bc); + } + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) { + find_new_prob(counts->eob_flag[tx_size][plane][ctx], + &fc->eob_flag[tx_size][plane][ctx], &savings, NULL, bc); + } + } + + for (level = 0; level < NUM_BASE_LEVELS; ++level) { + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx) { + find_new_prob(counts->coeff_base[tx_size][plane][level][ctx], + &fc->coeff_base[tx_size][plane][level][ctx], &savings, + NULL, bc); + } + } + } + + for (plane = 0; plane < PLANE_TYPES; ++plane) { + for (ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + find_new_prob(counts->coeff_lps[tx_size][plane][ctx], + &fc->coeff_lps[tx_size][plane][ctx], &savings, NULL, bc); + } + } +} + +void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) { + const TX_MODE tx_mode = cpi->common.tx_mode; + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + int ctx, plane; + + for (plane = 0; plane < PLANE_TYPES; ++plane) + for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) + av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx], + cpi->td.counts->dc_sign[plane][ctx], 1); + + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + write_txb_probs(w, cpi, tx_size); +} + +#if CONFIG_TXK_SEL +int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, + int use_fast_coef_costing, RD_STATS *rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + TX_TYPE txk_start = DCT_DCT; + TX_TYPE txk_end = TX_TYPES - 1; + TX_TYPE best_tx_type = txk_start; + int64_t best_rd = INT64_MAX; + const int coeff_ctx = combine_entropy_contexts(*a, *l); + TX_TYPE tx_type; + for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) { + if (plane == 0) mbmi->txk_type[block] = tx_type; + TX_TYPE ref_tx_type = + get_tx_type(get_plane_type(plane), xd, block, tx_size); + if (tx_type != ref_tx_type) { + // use get_tx_type() to check if the tx_type is valid for the current mode + // if it's not, we skip it here. + continue; + } + RD_STATS this_rd_stats; + av1_invalid_rd_stats(&this_rd_stats); + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) + av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, + &this_rd_stats.dist, &this_rd_stats.sse, + OUTPUT_HAS_PREDICTED_PIXELS); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + this_rd_stats.rate = av1_cost_coeffs( + cpi, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing); + int rd = + RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); + if (rd < best_rd) { + best_rd = rd; + *rd_stats = this_rd_stats; + best_tx_type = tx_type; + } + } + if (plane == 0) mbmi->txk_type[block] = best_tx_type; + // TODO(angiebird): Instead of re-call av1_xform_quant and av1_optimize_b, + // copy the best result in the above tx_type search for loop + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) + av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + if (!is_inter_block(mbmi)) { + // intra mode needs decoded result such that the next transform block + // can use it for prediction. + av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block]); + } + return best_rd; +} +#endif // CONFIG_TXK_SEL diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h new file mode 100644 index 000000000..552d47b54 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef ENCODETXB_H_ +#define ENCODETXB_H_ + +#include "./aom_config.h" +#include "av1/common/blockd.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "aom_dsp/bitwriter.h" +#ifdef __cplusplus +extern "C" { +#endif +void av1_alloc_txb_buf(AV1_COMP *cpi); +void av1_free_txb_buf(AV1_COMP *cpi); +int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, + int block, TXB_CTX *txb_ctx); +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, + aom_writer *w, int block, int plane, + const tran_low_t *tcoeff, uint16_t eob, + TXB_CTX *txb_ctx); +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, int plane); +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob); +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + const int mi_row, const int mi_col); +void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w); + +#if CONFIG_TXK_SEL +int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, + int use_fast_coef_costing, RD_STATS *rd_stats); +#endif +#ifdef __cplusplus +} +#endif + +#endif // COEFFS_CODING_H_ diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c new file mode 100644 index 000000000..34f0b9566 --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "aom_dsp/aom_dsp_common.h" + +static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + int i, j, k, l, m, n; + + for (i = 0; i < REFERENCE_MODES; i++) + td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; + +#if CONFIG_GLOBAL_MOTION + for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) + td->rd_counts.global_motion_used[i] += + td_t->rd_counts.global_motion_used[i]; +#endif // CONFIG_GLOBAL_MOTION + + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + for (n = 0; n < ENTROPY_TOKENS; n++) + td->rd_counts.coef_counts[i][j][k][l][m][n] += + td_t->rd_counts.coef_counts[i][j][k][l][m][n]; +} + +static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { + AV1_COMP *const cpi = thread_data->cpi; + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + int t; + + (void)unused; + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + return 0; +} + +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols); + int i; + + av1_init_tile_data(cpi); + + // Only run once to create threads and allocate thread data. + if (cpi->num_workers == 0) { + CHECK_MEM_ERROR(cm, cpi->workers, + aom_malloc(num_workers * sizeof(*cpi->workers))); + + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + + for (i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + + ++cpi->num_workers; + winterface->init(worker); + + thread_data->cpi = cpi; + + if (i < num_workers - 1) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + av1_setup_pc_tree(cm, thread_data->td); + + // Set up variance tree if needed. + if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) + av1_setup_var_tree(cm, thread_data->td); + + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + aom_calloc(1, sizeof(*thread_data->td->counts))); + + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->td = &cpi->td; + } + + winterface->sync(worker); + } + } + + for (i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data; + + worker->hook = (AVxWorkerHook)enc_worker_hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = NULL; + thread_data = (EncWorkerData *)worker->data1; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + +#if CONFIG_PALETTE + // Allocate buffers used by palette coding mode. + if (cpi->common.allow_screen_content_tools && i < num_workers - 1) { + MACROBLOCK *x = &thread_data->td->mb; + CHECK_MEM_ERROR(cm, x->palette_buffer, + aom_memalign(16, sizeof(*x->palette_buffer))); + } +#endif // CONFIG_PALETTE + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } + + for (i = 0; i < num_workers; i++) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h new file mode 100644 index 000000000..6c30a3e5c --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_ETHREAD_H_ +#define AV1_ENCODER_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct ThreadData; + +typedef struct EncWorkerData { + struct AV1_COMP *cpi; + struct ThreadData *td; + int start; +} EncWorkerData; + +void av1_encode_tiles_mt(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_ETHREAD_H_ diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c new file mode 100644 index 000000000..007694a38 --- /dev/null +++ b/third_party/aom/av1/encoder/extend.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/common.h" +#include "av1/encoder/extend.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + w - 1; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + memset(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +#if CONFIG_HIGHBITDEPTH +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, int w, + int h, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + aom_memset16(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); + aom_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} +#endif // CONFIG_HIGHBITDEPTH + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + // Altref filtering assumes 16 pixel extension + const int et_y = 16; + const int el_y = 16; + // Motion estimation may use src block variance with the block size up + // to 64x64, so the right and bottom need to be extended to 64 multiple + // or up to 16, whichever is greater. + const int er_y = + AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = + AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; + +#if CONFIG_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + return; + } +#endif // CONFIG_HIGHBITDEPTH + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y); + + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, src->uv_crop_height, + et_uv, el_uv, eb_uv, er_uv); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, src->uv_crop_height, + et_uv, el_uv, eb_uv, er_uv); +} + +void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw) { + // If the side is not touching the bounder then don't extend. + const int et_y = srcy ? 0 : dst->border; + const int el_y = srcx ? 0 : dst->border; + const int eb_y = srcy + srch != src->y_height + ? 0 + : dst->border + dst->y_height - src->y_height; + const int er_y = srcx + srcw != src->y_width + ? 0 + : dst->border + dst->y_width - src->y_width; + const int src_y_offset = srcy * src->y_stride + srcx; + const int dst_y_offset = srcy * dst->y_stride + srcx; + + const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); + const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); + const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); + const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); + const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); + const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); + + copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, + dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch, + et_y, el_y, eb_y, er_y); + + copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, + dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, + srch_uv, et_uv, el_uv, eb_uv, er_uv); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, + dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, + srch_uv, et_uv, el_uv, eb_uv, er_uv); +} diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h new file mode 100644 index 000000000..48178b964 --- /dev/null +++ b/third_party/aom/av1/encoder/extend.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_EXTEND_H_ +#define AV1_ENCODER_EXTEND_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int srcy, + int srcx, int srch, int srcw); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_EXTEND_H_ diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c new file mode 100644 index 000000000..e35a54ef2 --- /dev/null +++ b/third_party/aom/av1/encoder/firstpass.c @@ -0,0 +1,3026 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "./aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" + +#include "aom_dsp/variance.h" +#include "av1/common/entropymv.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rd.h" + +#define OUTPUT_FPF 0 +#define ARF_STATS_OUTPUT 0 + +#define GROUP_ADAPTIVE_MAXQ 1 + +#define BOOST_BREAKOUT 12.5 +#define BOOST_FACTOR 12.5 +#define FACTOR_PT_LOW 0.70 +#define FACTOR_PT_HIGH 0.90 +#define FIRST_PASS_Q 10.0 +#define GF_MAX_BOOST 96.0 +#define INTRA_MODE_PENALTY 1024 +#define KF_MAX_BOOST 128.0 +#define MIN_ARF_GF_BOOST 240 +#define MIN_DECAY_FACTOR 0.01 +#define MIN_KF_BOOST 300 +#define NEW_MV_MODE_PENALTY 32 +#define DARK_THRESH 64 +#define DEFAULT_GRP_WEIGHT 1.0 +#define RC_FACTOR_MIN 0.75 +#define RC_FACTOR_MAX 1.75 + +#define NCOUNT_INTRA_THRESH 8192 +#define NCOUNT_INTRA_FACTOR 3 +#define NCOUNT_FRAME_II_THRESH 5.0 + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) + +#if ARF_STATS_OUTPUT +unsigned int arf_count = 0; +#endif + +// Resets the first pass file to the given position using a relative seek from +// the current position. +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { + p->stats_in = position; +} + +// Read frame stats at an offset from the current position. +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_in_start)) { + return NULL; + } + + return &p->stats_in[offset]; +} + +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_in_end) return EOF; + + *fps = *p->stats_in; + ++p->stats_in; + return 1; +} + +static void output_stats(FIRSTPASS_STATS *stats, + struct aom_codec_pkt_list *pktlist) { + struct aom_codec_cx_pkt pkt; + pkt.kind = AOM_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); + aom_codec_pkt_list_add(pktlist, &pkt); + +// TEMP debug code +#if OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n", + stats->frame, stats->weight, stats->intra_error, stats->coded_error, + stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion, + stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct, + stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, + stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, + stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +#if CONFIG_FP_MB_STATS +static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size, + struct aom_codec_pkt_list *pktlist) { + struct aom_codec_cx_pkt pkt; + pkt.kind = AOM_CODEC_FPMB_STATS_PKT; + pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; + pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats); + aom_codec_pkt_list_add(pktlist, &pkt); +} +#endif + +static void zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->weight = 0.0; + section->intra_error = 0.0; + section->coded_error = 0.0; + section->sr_coded_error = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->intra_skip_pct = 0.0; + section->inactive_zone_rows = 0.0; + section->inactive_zone_cols = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->new_mv_count = 0.0; + section->count = 0.0; + section->duration = 1.0; +} + +static void accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->weight += frame->weight; + section->intra_error += frame->intra_error; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +// Calculate the linear size relative to a baseline of 1080P +#define BASE_SIZE 2073600.0 // 1920x1080 +static double get_linear_size_factor(const AV1_COMP *cpi) { + const double this_area = cpi->initial_width * cpi->initial_height; + return pow(this_area / BASE_SIZE, 0.5); +} + +// Calculate an active area of the image that discounts formatting +// bars and partially discounts other 0 energy areas. +#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +static double calculate_active_area(const AV1_COMP *cpi, + const FIRSTPASS_STATS *this_frame) { + double active_pct; + + active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +#define ACT_AREA_CORRECTION 0.5 +static double calculate_modified_err(const AV1_COMP *cpi, + const TWO_PASS *twopass, + const AV1EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_weight = stats->weight / stats->count; + const double av_err = (stats->coded_error * av_weight) / stats->count; + double modified_error = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_error *= + pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + + return fclamp(modified_error, twopass->modified_error_min, + twopass->modified_error_max); +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +void av1_init_first_pass(AV1_COMP *cpi) { + zero_stats(&cpi->twopass.total_stats); +} + +void av1_end_first_pass(AV1_COMP *cpi) { + output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); +} + +static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return aom_mse8x8; + case BLOCK_16X8: return aom_mse16x8; + case BLOCK_8X16: return aom_mse8x16; + default: return aom_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const aom_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_HIGHBITDEPTH +static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_8_mse8x8; + case BLOCK_16X8: return aom_highbd_8_mse16x8; + case BLOCK_8X16: return aom_highbd_8_mse8x16; + default: return aom_highbd_8_mse16x16; + } + break; + case 10: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_10_mse8x8; + case BLOCK_16X8: return aom_highbd_10_mse16x8; + case BLOCK_8X16: return aom_highbd_10_mse8x16; + default: return aom_highbd_10_mse16x16; + } + break; + case 12: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_12_mse8x8; + case BLOCK_16X8: return aom_highbd_12_mse16x8; + case BLOCK_8X16: return aom_highbd_12_mse8x16; + default: return aom_highbd_12_mse16x16; + } + break; + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. +static int get_search_range(const AV1_COMP *cpi) { + int sr = 0; + const int dim = AOMMIN(cpi->initial_width, cpi->initial_height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, MV *best_mv, + int *best_motion_err) { + MACROBLOCKD *const xd = &x->e_mbd; + MV tmp_mv = { 0, 0 }; + MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int num00, tmp_err, n; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + + int step_param = 3; + int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + const int sr = get_search_range(cpi); + step_param += sr; + further_steps -= sr; + + // Override the default variance function to use MSE. + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); + } +#endif // CONFIG_HIGHBITDEPTH + + // Center the initial step/diamond search on best mv. + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, + step_param, x->sadperbit16, &num00, + &v_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); + if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = tmp_mv; + } + + // Carry out further step/diamond searches as necessary. + n = num00; + num00 = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + --num00; + } else { + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, + step_param + n, x->sadperbit16, &num00, + &v_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); + if (tmp_err < INT_MAX - new_mv_mode_penalty) + tmp_err += new_mv_mode_penalty; + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = tmp_mv; + } + } + } +} + +static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) { + if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] < + cm->mi_cols) { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + cm->mi_rows + ? BLOCK_16X16 + : BLOCK_16X8; + } else { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + cm->mi_rows + ? BLOCK_8X16 + : BLOCK_8X8; + } +} + +static int find_fp_qindex(aom_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + +static void set_first_pass_params(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) { + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; +} + +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 +void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { + int mb_row, mb_col; + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo tile; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const PICK_MODE_CONTEXT *ctx = + &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none; + int i; + + int recon_yoffset, recon_uvoffset; + int64_t intra_error = 0; + int64_t coded_error = 0; + int64_t sr_coded_error = 0; + + int sum_mvr = 0, sum_mvc = 0; + int sum_mvr_abs = 0, sum_mvc_abs = 0; + int64_t sum_mvrs = 0, sum_mvcs = 0; + int mvcount = 0; + int intercount = 0; + int second_ref_count = 0; + const int intrapenalty = INTRA_MODE_PENALTY; + double neutral_count; + int intra_skip_count = 0; + int image_data_start_row = INVALID_ROW; + int new_mv_count = 0; + int sum_in_vectors = 0; + MV lastmv = { 0, 0 }; + TWO_PASS *twopass = &cpi->twopass; + const MV zero_mv = { 0, 0 }; + int recon_y_stride, recon_uv_stride, uv_mb_height; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + double intra_factor; + double brightness_factor; + BufferPool *const pool = cm->buffer_pool; + const int qindex = find_fp_qindex(cm->bit_depth); + const int mb_scale = mi_size_wide[BLOCK_16X16]; +#if CONFIG_PVQ + PVQ_QUEUE pvq_q; + od_adapt_ctx pvq_context; +#endif + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs); + } +#endif + + aom_clear_system_state(); + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + x->e_mbd.mi[0]->mbmi.sb_type = BLOCK_16X16; + + intra_factor = 0.0; + brightness_factor = 0.0; + neutral_count = 0.0; + + set_first_pass_params(cpi); + av1_set_quantizer(cm, qindex); + + av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + + av1_setup_src_planes(x, cpi->source, 0, 0); + av1_setup_dst_planes(xd->plane, cm->sb_size, new_yv12, 0, 0); + + if (!frame_is_intra_only(cm)) { + av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + +#if CONFIG_CFL + // Don't store luma on the fist pass since chroma is not computed + x->cfl_store_y = 0; +#endif + av1_frame_init_quantizer(cpi); + +#if CONFIG_PVQ + // For pass 1 of 2-pass encoding, init here for PVQ for now. + { + pvq_q.buf_len = 5000; + CHECK_MEM_ERROR(cm, pvq_q.buf, + aom_malloc(pvq_q.buf_len * sizeof(PVQ_INFO))); + pvq_q.curr_pos = 0; + x->pvq_coded = 0; + + x->pvq_q = &pvq_q; + + // TODO(yushin): Since this init step is also called in 2nd pass, + // or 1-pass encoding, consider factoring out it as a function. + // TODO(yushin) + // If activity masking is enabled, change below to OD_HVS_QM + x->daala_enc.qm = OD_FLAT_QM; // Hard coded. Enc/dec required to sync. + x->daala_enc.pvq_norm_lambda = OD_PVQ_LAMBDA; + x->daala_enc.pvq_norm_lambda_dc = OD_PVQ_LAMBDA; + + od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv, + x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); +#if CONFIG_DAALA_EC + od_ec_enc_init(&x->daala_enc.w.ec, 65025); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + +#if CONFIG_DAALA_EC + od_ec_enc_reset(&x->daala_enc.w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + } +#endif + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; +#if CONFIG_PVQ + pd[i].pvq_ref_coeff = ctx->pvq_ref_coeff[i]; +#endif + p[i].eobs = ctx->eobs[i]; +#if CONFIG_LV_MAP + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; +#endif + } + + av1_init_mv_probs(cm); +#if CONFIG_ADAPT_SCAN + av1_init_scan_order(cm); +#endif + av1_convolve_init(cm); +#if CONFIG_PVQ + od_adapt_ctx_reset(&pvq_context, 0); + x->daala_enc.state.adapt = &pvq_context; +#endif // CONFIG_PVQ + av1_initialize_rd_consts(cpi); + + // Tiling is ignored in the first pass. + av1_tile_init(&tile, cm, 0, 0); + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + MV best_ref_mv = { 0, 0 }; + + // Reset above block coeffs. + xd->up_available = (mb_row != 0); + recon_yoffset = (mb_row * recon_y_stride * 16); + recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int this_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; + +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif + + aom_clear_system_state(); + + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->left_available = (mb_col != 0); + xd->mi[0]->mbmi.sb_type = bsize; + xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize], + mb_col * mb_scale, mi_size_wide[bsize], +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize]); + + // Do intra 16x16 prediction. + xd->mi[0]->mbmi.segment_id = 0; +#if CONFIG_SUPERTX + xd->mi[0]->mbmi.segment_id_supertx = 0; +#endif // CONFIG_SUPERTX + xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0); + xd->mi[0]->mbmi.mode = DC_PRED; + xd->mi[0]->mbmi.tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2); + this_error = aom_get_mb_ss(x->plane[0].src_diff); + + // Keep a record of blocks that have almost no intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < UL_INTRA_THRESH) { + ++intra_skip_count; + } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { + image_data_start_row = mb_row; + } + +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: this_error >>= 4; break; + case AOM_BITS_12: this_error >>= 8; break; + default: + assert(0 && + "cm->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return; + } + } +#endif // CONFIG_HIGHBITDEPTH + + aom_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) + intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); + else + intra_factor += 1.0; + +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else + level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; +#endif + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) + brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); + else + brightness_factor += 1.0; + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; + + // Accumulate the intra error. + intra_error += (int64_t)this_error; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } +#endif + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + if (!frame_is_intra_only(cm)) { // Do a motion search + int tmp_err, motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); +#endif // CONFIG_HIGHBITDEPTH + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = + cpi->unscaled_last_source->y_stride; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); +#endif // CONFIG_HIGHBITDEPTH + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(&best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + + // Search in an older reference frame. + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); +#endif // CONFIG_HIGHBITDEPTH + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, + &gf_motion_error); + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++second_ref_count; + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + sr_coded_error += gf_motion_error; + else + sr_coded_error += this_error; + } else { + sr_coded_error += motion_error; + } + } else { + sr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv.row = 0; + best_ref_mv.col = 0; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (motion_error <= this_error) { + aom_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + neutral_count += 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. + } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + neutral_count += + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mbmi.mode = NEWMV; + xd->mi[0]->mbmi.mv[0].as_mv = mv; + xd->mi[0]->mbmi.tx_size = TX_4X4; + xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; + xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + av1_build_inter_predictors_sby(xd, mb_row * mb_scale, + mb_col * mb_scale, NULL, bsize); + av1_encode_sby_pass1(cm, x, bsize); + sum_mvr += mv.row; + sum_mvr_abs += abs(mv.row); + sum_mvc += mv.col; + sum_mvc_abs += abs(mv.col); + sum_mvrs += mv.row * mv.row; + sum_mvcs += mv.col * mv.col; + ++intercount; + + best_ref_mv = mv; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // inter predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (!is_zero_mv(&mv)) { + ++mvcount; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.col > 0 && mv.col >= abs(mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_UP_MASK; + } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; + lastmv = mv; + + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --sum_in_vectors; + else if (mv.row < 0) + ++sum_in_vectors; + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++sum_in_vectors; + else if (mv.row < 0) + --sum_in_vectors; + } + + // Does the col vector point inwards or outwards? + if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --sum_in_vectors; + else if (mv.col < 0) + ++sum_in_vectors; + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++sum_in_vectors; + else if (mv.col < 0) + --sum_in_vectors; + } + } + } + } else { + sr_coded_error += (int64_t)this_error; + } + coded_error += (int64_t)this_error; + + // Adjust to the next column of MBs. + x->plane[0].src.buf += 16; + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; + + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; + } + + // Adjust to the next row of MBs. + x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; + x->plane[1].src.buf += + uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; + x->plane[2].src.buf += + uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; + + aom_clear_system_state(); + } + +#if CONFIG_PVQ +#if CONFIG_DAALA_EC + od_ec_enc_clear(&x->daala_enc.w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + + x->pvq_q->last_pos = x->pvq_q->curr_pos; + x->pvq_q->curr_pos = 0; + x->pvq_q = NULL; + + aom_free(pvq_q.buf); +#endif + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((image_data_start_row > cm->mb_rows / 2) || + (image_data_start_row == INVALID_ROW)) { + image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (image_data_start_row > 0) { + intra_skip_count = + AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); + } + + { + FIRSTPASS_STATS fps; + // The minimum error here insures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + intra_factor = intra_factor / (double)num_mbs; + brightness_factor = brightness_factor / (double)num_mbs; + fps.weight = intra_factor * brightness_factor; + + fps.frame = cm->current_video_frame; + fps.coded_error = (double)(coded_error >> 8) + min_err; + fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; + fps.intra_error = (double)(intra_error >> 8) + min_err; + fps.count = 1.0; + fps.pcnt_inter = (double)intercount / num_mbs; + fps.pcnt_second_ref = (double)second_ref_count / num_mbs; + fps.pcnt_neutral = (double)neutral_count / num_mbs; + fps.intra_skip_pct = (double)intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)image_data_start_row; + fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix + + if (mvcount > 0) { + fps.MVr = (double)sum_mvr / mvcount; + fps.mvr_abs = (double)sum_mvr_abs / mvcount; + fps.MVc = (double)sum_mvc / mvcount; + fps.mvc_abs = (double)sum_mvc_abs / mvcount; + fps.MVrv = + ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; + fps.MVcv = + ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; + fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); + fps.new_mv_count = new_mv_count; + fps.pcnt_motion = (double)mvcount / num_mbs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; + } + + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. + fps.duration = (double)(source->ts_end - source->ts_start); + + // Don't want to do output stats with a stack variable! + twopass->this_frame_stats = fps; + output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); + accumulate_stats(&twopass->total_stats, &fps); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs, + cpi->output_pkt_list); + } +#endif + } + + // Copy the previous Last Frame back into gf and and arf buffers if + // the prediction is good enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((cm->current_video_frame > 0) && + (twopass->this_frame_stats.pcnt_inter > 0.20) && + ((twopass->this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { + if (gld_yv12 != NULL) { +#if CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); +#else + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idx]); +#endif // CONFIG_EXT_REFS + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + aom_extend_frame_borders(new_yv12); + +// The frame we just compressed now becomes the last frame. +#if CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]], + cm->new_fb_idx); +#else + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); +#endif // CONFIG_EXT_REFS + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { +#if CONFIG_EXT_REFS + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]); +#else + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], + cm->ref_frame_map[cpi->lst_fb_idx]); +#endif // CONFIG_EXT_REFS + } + + // Use this to see what the first pass reconstruction looks like. + if (0) { + char filename[512]; + FILE *recon_file; + snprintf(filename, sizeof(filename), "enc%04d.yuv", + (int)cm->current_video_frame); + + if (cm->current_video_frame == 0) + recon_file = fopen(filename, "wb"); + else + recon_file = fopen(filename, "ab"); + + (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); + fclose(recon_file); + } + + ++cm->current_video_frame; +} + +static double calc_correction_factor(double err_per_mb, double err_divisor, + double pt_low, double pt_high, int q, + aom_bit_depth_t bit_depth) { + const double error_term = err_per_mb / err_divisor; + + // Adjustment based on actual quantizer to power term. + const double power_term = + AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); + + // Calculate correction factor. + if (power_term < 1.0) assert(error_term >= 0.0); + + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +#define ERR_DIVISOR 100.0 +static int get_twopass_worst_quality(const AV1_COMP *cpi, + const double section_err, + double inactive_zone, + int section_target_bandwidth, + double group_weight_factor) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.MBs; + const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = section_err / active_mbs; + const double speed_term = 1.0 + 0.04 * oxcf->speed; + double ediv_size_correction; + const int target_norm_bits_per_mb = + (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / + active_mbs; + int q; + + // Larger image formats are expected to be a little harder to code + // relatively given the same prediction error score. This in part at + // least relates to the increased size and hence coding overheads of + // motion vectors. Some account of this is made through adjustment of + // the error divisor. + ediv_size_correction = + AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi))); + if (ediv_size_correction < 1.0) + ediv_size_correction = -(1.0 / ediv_size_correction); + ediv_size_correction *= 4.0; + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + for (q = rc->best_quality; q < rc->worst_quality; ++q) { + const double factor = calc_correction_factor( + av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW, + FACTOR_PT_HIGH, q, cpi->common.bit_depth); + const int bits_per_mb = av1_rc_bits_per_mb( + INTER_FRAME, q, factor * speed_term * group_weight_factor, + cpi->common.bit_depth); + if (bits_per_mb <= target_norm_bits_per_mb) break; + } + + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level); + return q; + } +} + +static void setup_rf_level_maxq(AV1_COMP *cpi) { + int i; + RATE_CONTROL *const rc = &cpi->rc; + for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) { + int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality); + rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality); + } +} + +void av1_init_subsampling(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + const int w = cm->width; + const int h = cm->height; + int i; + + for (i = 0; i < FRAME_SCALE_STEPS; ++i) { + // Note: Frames with odd-sized dimensions may result from this scaling. + rc->frame_width[i] = (w * 16) / frame_scale_factor[i]; + rc->frame_height[i] = (h * 16) / frame_scale_factor[i]; + } + + setup_rf_level_maxq(cpi); +} + +void av1_calculate_coded_size(AV1_COMP *cpi, int *scaled_frame_width, + int *scaled_frame_height) { + RATE_CONTROL *const rc = &cpi->rc; + *scaled_frame_width = rc->frame_width[rc->frame_size_selector]; + *scaled_frame_height = rc->frame_height[rc->frame_size_selector]; +} + +void av1_init_second_pass(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + double frame_rate; + FIRSTPASS_STATS *stats; + + zero_stats(&twopass->total_stats); + zero_stats(&twopass->total_left_stats); + + if (!twopass->stats_in_end) return; + + stats = &twopass->total_stats; + + *stats = *twopass->stats_in_end; + twopass->total_left_stats = *stats; + + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + av1_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + // Scan the first pass file and calculate a modified total error based upon + // the bias/power function used to allocate bits. + { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_in_end) { + modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + if (oxcf->resize_mode != RESIZE_NONE) { + av1_init_subsampling(cpi); + } +} + +#define SR_DIFF_PART 0.0015 +#define MOTION_AMP_PART 0.003 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 + +static double get_sr_decay_rate(const AV1_COMP *cpi, + const FIRSTPASS_STATS *frame) { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_decay = 1.0; + double modified_pct_inter; + double modified_pcnt_intra; + const double motion_amplitude_factor = + frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); + + modified_pct_inter = frame->pcnt_inter; + if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH) { + modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); + sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - + (MOTION_AMP_PART * motion_amplitude_factor) - + (INTRA_PART * modified_pcnt_intra); + } + return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_zero_motion_factor(const AV1_COMP *cpi, + const FIRSTPASS_STATS *frame) { + const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; + double sr_decay = get_sr_decay_rate(cpi, frame); + return AOMMIN(sr_decay, zero_motion_pct); +} + +#define ZM_POWER_FACTOR 0.75 + +static double get_prediction_decay_rate(const AV1_COMP *cpi, + const FIRSTPASS_STATS *next_frame) { + const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame); + const double zero_motion_factor = + (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), + ZM_POWER_FACTOR)); + + return AOMMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval, + int still_interval, + double loop_decay_rate, + double last_decay_rate) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. + if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int j; + + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; + if (stats >= twopass->stats_in_end) break; + + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + + // Only if it does do we signal a transition to still. + return j == still_interval; + } + + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + double *mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + *mv_in_out = stats->mv_in_out_count * pct; + *mv_in_out_accumulator += *mv_in_out; + *abs_mv_in_out_accumulator += fabs(*mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + *mv_ratio_accumulator += + pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs); + *mv_ratio_accumulator += + pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); + } +} + +#define BASELINE_ERR_PER_MB 1000.0 +static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); + int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + + // Correct for any inactive region in the image + num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; + + // Increase boost for frames where new data coming into frame (e.g. zoom out). + // Slightly reduce boost if there is a net balance of motion out of the frame + // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + // In the extreme case the boost is halved. + else + frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + return AOMMIN(frame_boost, max_boost * boost_q_correction); +} + +static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames, + int *f_boost, int *b_boost) { + TWO_PASS *const twopass = &cpi->twopass; + int i; + double boost_score = 0.0; + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + int arf_boost; + int flash_detected = 0; + + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + + boost_score += + decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + } + + *f_boost = (int)boost_score; + + // Reset for backward looking loop. + boost_score = 0.0; + mv_ratio_accumulator = 0.0; + decay_accumulator = 1.0; + this_frame_mv_in_out = 0.0; + mv_in_out_accumulator = 0.0; + abs_mv_in_out_accumulator = 0.0; + + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); + + // Cumulative effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + + boost_score += + decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + } + *b_boost = (int)boost_score; + + arf_boost = (*f_boost + *b_boost); + if (arf_boost < ((b_frames + f_frames) * 20)) + arf_boost = ((b_frames + f_frames) * 20); + arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST); + + return arf_boost; +} + +// Calculate a section intra ratio used in setting max loop filter. +static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number bits extra to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +#if !CONFIG_EXT_REFS +// Current limit on maximum number of active arfs in a GF/ARF group. +#define MAX_ACTIVE_ARFS 2 +#define ARF_SLOT1 2 +#define ARF_SLOT2 3 +// This function indirects the choice of buffers for arfs. +// At the moment the values are fixed but this may change as part of +// the integration process with other codec features that swap buffers around. +static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { + arf_buffer_indices[0] = ARF_SLOT1; + arf_buffer_indices[1] = ARF_SLOT2; +} +#endif + +static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, + double group_error, int gf_arf_bits) { + RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + FIRSTPASS_STATS frame_stats; + int i; + int frame_index = 0; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + int64_t total_group_bits = gf_group_bits; + double modified_err = 0.0; + double err_fraction; + int mid_boost_bits = 0; +#if CONFIG_EXT_REFS + // The use of bi-predictive frames are only enabled when following 3 + // conditions are met: + // (1) Alt-ref is enabled; + // (2) The bi-predictive group interval is at least 2; and + // (3) The bi-predictive group interval is strictly smaller than the + // golden group interval. + const int is_bipred_enabled = + rc->source_alt_ref_pending && rc->bipred_group_interval && + rc->bipred_group_interval <= + (rc->baseline_gf_interval - rc->source_alt_ref_pending); + int bipred_group_end = 0; + int bipred_frame_index = 0; + + int arf_pos[MAX_EXT_ARFS + 1]; + const unsigned char ext_arf_interval = + (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1); + int which_arf = cpi->num_extra_arfs; + int subgroup_interval[MAX_EXT_ARFS + 1]; + int ext_arf_boost[MAX_EXT_ARFS]; + int is_sg_bipred_enabled = is_bipred_enabled; + int accumulative_subgroup_interval = 0; +#else + int mid_frame_idx; + unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + av1_zero_array(ext_arf_boost, MAX_EXT_ARFS); +#endif // CONFIG_EXT_REFS + + key_frame = cpi->common.frame_type == KEY_FRAME; + +#if !CONFIG_EXT_REFS + get_arf_buffer_indices(arf_buffer_indices); +#endif // !CONFIG_EXT_REFS + + // For key frames the frame target rate is already set and it + // is also the golden frame. + if (!key_frame) { + if (rc->source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->bit_allocation[frame_index] = 0; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->bit_allocation[frame_index] = gf_arf_bits; + } +#if CONFIG_EXT_REFS + gf_group->arf_update_idx[frame_index] = 0; + gf_group->arf_ref_idx[frame_index] = 0; +#else + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; +#endif // CONFIG_EXT_REFS + // Step over the golden frame / overlay frame + if (EOF == input_stats(twopass, &frame_stats)) return; + } + +#if CONFIG_EXT_REFS + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; +#endif // CONFIG_EXT_REFS + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + + frame_index++; + +#if CONFIG_EXT_REFS + bipred_frame_index++; +#endif // CONFIG_EXT_REFS + + // Store the bits to spend on the ARF if there is one. + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->bit_allocation[frame_index] = gf_arf_bits; + + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + +#if CONFIG_EXT_REFS + gf_group->arf_update_idx[frame_index] = 0; + gf_group->arf_ref_idx[frame_index] = 0; + + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; +// NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. +#else + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[frame_index] = + arf_buffer_indices[cpi->multi_arf_last_grp_enabled && + rc->source_alt_ref_active]; +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + // Work out the ARFs' positions in this gf group + // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display + // order (except for the original ARF). In the example of three ALT_REF's, + // We index ALTREF's as: KEY ----- ALT2 ----- ALT1 ----- ALT0 + // but code them in the following order: + // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0 + arf_pos[0] = + frame_index + cpi->num_extra_arfs + gf_group->arf_src_offset[1] + 1; + for (i = 0; i < cpi->num_extra_arfs; ++i) { + arf_pos[i + 1] = + frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2); + subgroup_interval[i] = arf_pos[i] - arf_pos[i + 1] - (i == 0 ? 1 : 2); + } + subgroup_interval[cpi->num_extra_arfs] = arf_pos[cpi->num_extra_arfs] - + frame_index - + (cpi->num_extra_arfs == 0 ? 1 : 2); +#endif // CONFIG_EXT_REFS + + ++frame_index; + +#if CONFIG_EXT_REFS + // Insert an extra ARF + if (cpi->num_extra_arfs) { + gf_group->update_type[frame_index] = ARF_UPDATE; + // Note (weitinglin): GF_ARF_LOW is also used as an identifier + // for internal ALT_REF's: + gf_group->rf_level[frame_index] = GF_ARF_LOW; + gf_group->arf_src_offset[frame_index] = ext_arf_interval; + gf_group->arf_update_idx[frame_index] = which_arf; + gf_group->arf_ref_idx[frame_index] = 0; + ++frame_index; + } + accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs]; +#else + if (cpi->multi_arf_enabled) { + // Set aside a slot for a level 1 arf. + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_LOW; + gf_group->arf_src_offset[frame_index] = + (unsigned char)((rc->baseline_gf_interval >> 1) - 1); + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; + ++frame_index; + } +#endif // CONFIG_EXT_ARFS + } + +#if !CONFIG_EXT_REFS + // Define middle frame + mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; +#endif // !CONFIG_EXT_REFS + + // Allocate bits to the other frames in the group. + for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) { +#if !CONFIG_EXT_REFS + int arf_idx = 0; +#endif // !CONFIG_EXT_REFS + + if (EOF == input_stats(twopass, &frame_stats)) break; + + modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats); + + if (group_error > 0) + err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error); + else + err_fraction = 0.0; + + target_frame_size = (int)((double)total_group_bits * err_fraction); + + if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { + mid_boost_bits += (target_frame_size >> 4); + target_frame_size -= (target_frame_size >> 4); +#if !CONFIG_EXT_REFS + if (frame_index <= mid_frame_idx) arf_idx = 1; +#endif // !CONFIG_EXT_REFS + } + +#if CONFIG_EXT_REFS + gf_group->arf_update_idx[frame_index] = which_arf; + gf_group->arf_ref_idx[frame_index] = which_arf; +#else + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; +#endif // CONFIG_EXT_REFS + + target_frame_size = + clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits)); + +#if CONFIG_EXT_REFS + // If we are going to have ARFs, check if we can have BWDREF in this + // subgroup. + if (rc->source_alt_ref_pending) { + is_sg_bipred_enabled = + is_bipred_enabled && + (subgroup_interval[which_arf] > rc->bipred_group_interval); + } + + // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive + // frame group interval is strictly smaller than that of the GOLDEN + // FRAME group interval. + // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on. + if (is_sg_bipred_enabled && !bipred_group_end) { + const int cur_brf_src_offset = rc->bipred_group_interval - 1; + + // --- BRF_UPDATE --- + if (bipred_frame_index == 1) { + gf_group->update_type[frame_index] = BRF_UPDATE; + gf_group->bidir_pred_enabled[frame_index] = 1; + gf_group->brf_src_offset[frame_index] = cur_brf_src_offset; + // --- LAST_BIPRED_UPDATE --- + } else if (bipred_frame_index == rc->bipred_group_interval) { + gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE; + gf_group->bidir_pred_enabled[frame_index] = 1; + gf_group->brf_src_offset[frame_index] = 0; + // Reset the bi-predictive frame index. + bipred_frame_index = 0; + // --- BIPRED_UPDATE --- + } else { + gf_group->update_type[frame_index] = BIPRED_UPDATE; + gf_group->bidir_pred_enabled[frame_index] = 1; + gf_group->brf_src_offset[frame_index] = 0; + } + + bipred_frame_index++; + // Check whether the next bi-predictive frame group would entirely be + // included within the current golden frame group. + // In addition, we need to avoid coding a BRF right before an ARF. + if (bipred_frame_index == 1 && + (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) { + bipred_group_end = 1; + } + } else { +#endif // CONFIG_EXT_REFS + gf_group->update_type[frame_index] = LF_UPDATE; +#if CONFIG_EXT_REFS + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; + } +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + if (gf_group->update_type[frame_index] == BRF_UPDATE) { + // Boost up the allocated bits on BWDREF_FRAME + gf_group->rf_level[frame_index] = GF_ARF_LOW; + gf_group->bit_allocation[frame_index] = + target_frame_size + (target_frame_size >> 2); + } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) { + // Press down the allocated bits on LAST_BIPRED_UPDATE frames + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->bit_allocation[frame_index] = + target_frame_size - (target_frame_size >> 1); + } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) { + // TODO(zoeliu): To investigate whether the allocated bits on + // BIPRED_UPDATE frames need to be further adjusted. + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->bit_allocation[frame_index] = target_frame_size; + } else { +#endif // CONFIG_EXT_REFS + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->bit_allocation[frame_index] = target_frame_size; +#if CONFIG_EXT_REFS + } +#endif // CONFIG_EXT_REFS + + ++frame_index; + +#if CONFIG_EXT_REFS + // Check if we need to update the ARF + if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 && + frame_index > arf_pos[which_arf]) { + --which_arf; + accumulative_subgroup_interval += subgroup_interval[which_arf] + 1; + // Meet the new subgroup. Reset the bipred_group_end flag; + bipred_group_end = 0; + // Insert another extra ARF after the overlay frame + if (which_arf) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_LOW; + gf_group->arf_src_offset[frame_index] = ext_arf_interval; + gf_group->arf_update_idx[frame_index] = which_arf; + gf_group->arf_ref_idx[frame_index] = 0; + ++frame_index; + } + } +#endif // CONFIG_EXT_REFS + } + +// Note: +// We need to configure the frame at the end of the sequence + 1 that will be +// the start frame for the next group. Otherwise prior to the call to +// av1_rc_get_second_pass_params() the data will be undefined. +#if CONFIG_EXT_REFS + gf_group->arf_update_idx[frame_index] = 0; + gf_group->arf_ref_idx[frame_index] = 0; +#else + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; +#endif // CONFIG_EXT_REFS + + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + +#if CONFIG_EXT_REFS + if (cpi->num_extra_arfs) { + for (i = cpi->num_extra_arfs; i > 0; --i) { + int arf_pos_in_gf = (i == cpi->num_extra_arfs ? 2 : arf_pos[i + 1] + 1); + gf_group->bit_allocation[arf_pos_in_gf] = + gf_group->bit_allocation[arf_pos[i]]; + gf_group->update_type[arf_pos[i]] = INTNL_OVERLAY_UPDATE; + gf_group->bit_allocation[arf_pos[i]] = 0; + gf_group->rf_level[arf_pos[i]] = INTER_NORMAL; + } + } +#else + // Final setup for second arf and its overlay. + if (cpi->multi_arf_enabled) { + gf_group->bit_allocation[2] = + gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; + gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; + gf_group->bit_allocation[mid_frame_idx] = 0; + } +#endif // CONFIG_EXT_REFS + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + } + +#if CONFIG_EXT_REFS + gf_group->bidir_pred_enabled[frame_index] = 0; + gf_group->brf_src_offset[frame_index] = 0; +#endif // CONFIG_EXT_REFS + + // Note whether multi-arf was enabled this group for next time. + cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; +} + +// Analyse and define a gf/arf group. +static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + int i; + + double boost_score = 0.0; + double old_boost_score = 0.0; + double gf_group_err = 0.0; +#if GROUP_ADAPTIVE_MAXQ + double gf_group_raw_error = 0.0; +#endif + double gf_group_skip_pct = 0.0; + double gf_group_inactive_zone_rows = 0.0; + double gf_first_frame_err = 0.0; + double mod_frame_err = 0.0; + + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double zero_motion_accumulator = 1.0; + + double loop_decay_rate = 1.00; + double last_loop_decay_rate = 1.00; + + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + double mv_ratio_accumulator_thresh; + unsigned int allow_alt_ref = is_altref_enabled(cpi); + + int f_boost = 0; + int b_boost = 0; + int flash_detected; + int active_max_gf_interval; + int active_min_gf_interval; + int64_t gf_group_bits; + double gf_group_error_left; + int gf_arf_bits; + const int is_key_frame = frame_is_intra_only(cm); + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (is_key_frame == 0) { + av1_zero(twopass->gf_group); + } + + aom_clear_system_state(); + av1_zero(next_frame); + + // Load stats for the current frame. + mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. + gf_first_frame_err = mod_frame_err; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + if (arf_active_or_kf) { + gf_group_err -= gf_first_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_group_raw_error -= this_frame->coded_error; +#endif + gf_group_skip_pct -= this_frame->intra_skip_pct; + gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; + } + + // Motion breakout threshold for loop below depends on image size. + mv_ratio_accumulator_thresh = + (cpi->initial_height + cpi->initial_width) / 4.0; + + // Set a maximum and minimum interval for the GF group. + // If the image appears almost completely static we can extend beyond this. + { + int int_max_q = (int)(av1_convert_qindex_to_q(twopass->active_worst_quality, + cpi->common.bit_depth)); + int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex, + cpi->common.bit_depth)); + + active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200); + if (active_min_gf_interval > rc->max_gf_interval) + active_min_gf_interval = rc->max_gf_interval; + + if (cpi->multi_arf_allowed) { + active_max_gf_interval = rc->max_gf_interval; + } else { + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); + + // We have: active_min_gf_interval <= rc->max_gf_interval + if (active_max_gf_interval < active_min_gf_interval) + active_max_gf_interval = active_min_gf_interval; + else if (active_max_gf_interval > rc->max_gf_interval) + active_max_gf_interval = rc->max_gf_interval; + } + } + + i = 0; + while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { + ++i; + + // Accumulate error score of frames in this gf group. + mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + gf_group_err += mod_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_group_raw_error += this_frame->coded_error; +#endif + gf_group_skip_pct += this_frame->intra_skip_pct; + gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; + + if (EOF == input_stats(twopass, &next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, 0); + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + last_loop_decay_rate = loop_decay_rate; + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + + decay_accumulator = decay_accumulator * loop_decay_rate; + + // Monitor for static sections. + zero_motion_accumulator = AOMMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, + last_loop_decay_rate)) { + allow_alt_ref = 0; + break; + } + } + + // Calculate a boost number for this frame. + boost_score += + decay_accumulator * + calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + + // Break out conditions. + if ( + // Break at active_max_gf_interval unless almost totally static. + (i >= (active_max_gf_interval + arf_active_or_kf) && + zero_motion_accumulator < 0.995) || + ( + // Don't break out with a very short interval. + (i >= active_min_gf_interval + arf_active_or_kf) && + (!flash_detected) && + ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { + boost_score = old_boost_score; + break; + } + + *this_frame = next_frame; + old_boost_score = boost_score; + } + + twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); + + // Was the group length constrained by the requirement for a new KF? + rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + + // Should we use the alternate reference frame. + if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval)) { + // Calculate the boost for alt ref. + rc->gfu_boost = + calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + rc->source_alt_ref_pending = 1; + + // Test to see if multi arf is appropriate. + cpi->multi_arf_enabled = + (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && + (zero_motion_accumulator < 0.995)) + ? 1 + : 0; + } else { + rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->source_alt_ref_pending = 0; + } + + // Set the interval until the next gf. + rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + +#if CONFIG_EXT_REFS + // Compute how many extra alt_refs we can have + cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, + rc->source_alt_ref_pending); + // Currently at maximum two extra ARFs' are allowed + assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); +#endif // CONFIG_EXT_REFS + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + +#if CONFIG_EXT_REFS + rc->bipred_group_interval = BFG_INTERVAL; + // The minimum bi-predictive frame group interval is 2. + if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0; +#endif // CONFIG_EXT_REFS + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate the bits to be allocated to the gf/arf group as a whole + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + +#if GROUP_ADAPTIVE_MAXQ + // Calculate an estimate of the maxq needed for the group. + // We are more agressive about correcting for sections + // where there could be significant overshoot than for easier + // sections where we do not wish to risk creating an overshoot + // of the allocated bit budget. + if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) { + const int vbr_group_bits_per_frame = + (int)(gf_group_bits / rc->baseline_gf_interval); + const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; + const double group_av_skip_pct = + gf_group_skip_pct / rc->baseline_gf_interval; + const double group_av_inactive_zone = + ((gf_group_inactive_zone_rows * 2) / + (rc->baseline_gf_interval * (double)cm->mb_rows)); + + int tmp_q; + // rc factor is a weight factor that corrects for local rate control drift. + double rc_factor = 1.0; + if (rc->rate_error_estimate > 0) { + rc_factor = AOMMAX(RC_FACTOR_MIN, + (double)(100 - rc->rate_error_estimate) / 100.0); + } else { + rc_factor = AOMMIN(RC_FACTOR_MAX, + (double)(100 - rc->rate_error_estimate) / 100.0); + } + tmp_q = get_twopass_worst_quality( + cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor); + twopass->active_worst_quality = + AOMMAX(tmp_q, twopass->active_worst_quality >> 1); + } +#endif + + // Calculate the extra bits to be used for boosted frame(s) + gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, + gf_group_bits); + + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; + + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. + // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. + // For normal GFs remove the score for the GF itself unless this is + // also a key frame in which case it has already been accounted for. + if (rc->source_alt_ref_pending) { + gf_group_error_left = gf_group_err - mod_frame_err; + } else if (is_key_frame == 0) { + gf_group_error_left = gf_group_err - gf_first_frame_err; + } else { + gf_group_error_left = gf_group_err; + } + + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits); + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. + if (cpi->common.frame_type != KEY_FRAME) { + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); + } + + if (oxcf->resize_mode == RESIZE_DYNAMIC) { + // Default to starting GF groups at normal frame size. + cpi->rc.next_frame_size_selector = UNSCALED; + } +} + +// Threshold for use of the lagging second reference frame. High second ref +// usage may point to a transient event like a flash or occlusion rather than +// a real scene cut. +#define SECOND_REF_USEAGE_THRESH 0.1 +// Minimum % intra coding observed in first pass (1.0 = 100%) +#define MIN_INTRA_LEVEL 0.25 +// Minimum ratio between the % of intra coding and inter coding in the first +// pass after discounting neutral blocks (discounting neutral blocks in this +// way helps catch scene cuts in clips with very flat areas or letter box +// format clips with image padding. +#define INTRA_VS_INTER_THRESH 2.0 +// Hard threshold where the first pass chooses intra for almost all blocks. +// In such a case even if the frame is not a scene cut coding a key frame +// may be a good option. +#define VERY_LOW_INTER_THRESH 0.05 +// Maximum threshold for the relative ratio of intra error score vs best +// inter error score. +#define KF_II_ERR_THRESHOLD 2.5 +// In real scene cuts there is almost always a sharp change in the intra +// or inter error score. +#define ERR_CHANGE_THRESHOLD 0.4 +// For real scene cuts we expect an improvment in the intra inter error +// ratio in the next frame. +#define II_IMPROVEMENT_THRESHOLD 3.5 +#define KF_II_MAX 128.0 + +static int test_candidate_kf(TWO_PASS *twopass, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *next_frame) { + int is_viable_kf = 0; + double pcnt_intra = 1.0 - this_frame->pcnt_inter; + double modified_pcnt_inter = + this_frame->pcnt_inter - this_frame->pcnt_neutral; + + // Does the frame satisfy the primary criteria of a key frame? + // See above for an explanation of the test criteria. + // If so, then examine how well it predicts subsequent frames. + if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || + ((pcnt_intra > MIN_INTRA_LEVEL) && + (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + ((this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < + KF_II_ERR_THRESHOLD) && + ((fabs(last_frame->coded_error - this_frame->coded_error) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > + ERR_CHANGE_THRESHOLD) || + (fabs(last_frame->intra_error - this_frame->intra_error) / + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > + ERR_CHANGE_THRESHOLD) || + ((next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > + II_IMPROVEMENT_THRESHOLD))))) { + int i; + const FIRSTPASS_STATS *start_pos = twopass->stats_in; + FIRSTPASS_STATS local_next_frame = *next_frame; + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { + double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + + if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; + + // Cumulative effect of decay in prediction quality. + if (local_next_frame.pcnt_inter > 0.85) + decay_accumulator *= local_next_frame.pcnt_inter; + else + decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; + + // Keep a running total. + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses. + if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < + 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 3.0) || + (local_next_frame.intra_error < 200)) { + break; + } + + old_boost_score = boost_score; + + // Get the next frame details + if (EOF == input_stats(twopass, &local_next_frame)) break; + } + + // If there is tolerable prediction for at least the next 3 frames then + // break out else discard this potential key frame and move on + if (boost_score > 30.0 && (i > 3)) { + is_viable_kf = 1; + } else { + // Reset the file position + reset_fpf_position(twopass, start_pos); + + is_viable_kf = 0; + } + } + + return is_viable_kf; +} + +#define FRAMES_TO_CHECK_DECAY 8 + +static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { + int i, j; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const FIRSTPASS_STATS first_frame = *this_frame; + const FIRSTPASS_STATS *const start_position = twopass->stats_in; + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS last_frame; + int kf_bits = 0; + int loop_decay_counter = 0; + double decay_accumulator = 1.0; + double av_decay_accumulator = 0.0; + double zero_motion_accumulator = 1.0; + double boost_score = 0.0; + double kf_mod_err = 0.0; + double kf_group_err = 0.0; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + + av1_zero(next_frame); + + cpi->common.frame_type = KEY_FRAME; + + // Reset the GF group data structures. + av1_zero(*gf_group); + + // Is this a forced key frame by interval. + rc->this_key_frame_forced = rc->next_key_frame_forced; + + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + cpi->multi_arf_last_grp_enabled = 0; + + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + + rc->frames_to_key = 1; + + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. + + kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + + // Find the next keyframe. + i = 0; + while (twopass->stats_in < twopass->stats_in_end && + rc->frames_to_key < cpi->oxcf.key_freq) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Load the next frame's stats. + last_frame = *this_frame; + input_stats(twopass, this_frame); + + // Provided that we are not at the end of the file... + if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { + double loop_decay_rate; + + // Check for a scene cut. + if (test_candidate_kf(twopass, &last_frame, this_frame, + twopass->stats_in)) + break; + + // How fast is the prediction quality decaying? + loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i, + loop_decay_rate, decay_accumulator)) + break; + + // Step on to the next frame. + ++rc->frames_to_key; + + // If we don't have a real key frame within the next two + // key_freq intervals then break out of the loop. + if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; + } else { + ++rc->frames_to_key; + } + ++i; + } + + // If there is a max kf interval set by the user we must obey it. + // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. + if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) { + FIRSTPASS_STATS tmp_frame = first_frame; + + rc->frames_to_key /= 2; + + // Reset to the start of the group. + reset_fpf_position(twopass, start_position); + + kf_group_err = 0.0; + + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); + input_stats(twopass, &tmp_frame); + } + rc->next_key_frame_forced = 1; + } else if (twopass->stats_in == twopass->stats_in_end || + rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; + } else { + rc->next_key_frame_forced = 0; + } + + // Special case for the last key frame of the file. + if (twopass->stats_in >= twopass->stats_in_end) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + } + + // Calculate the number of bits that should be assigned to the kf group. + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = (int64_t)( + twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + // Reset the first pass file position. + reset_fpf_position(twopass, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + decay_accumulator = 1.0; + boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + // Monitor for static sections. + zero_motion_accumulator = AOMMIN(zero_motion_accumulator, + get_zero_motion_factor(cpi, &next_frame)); + + // Not all frames in the group are necessarily used in calculating boost. + if ((i <= rc->max_gf_interval) || + ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { + const double frame_boost = + calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST); + + // How fast is prediction quality decaying. + if (!detect_flash(twopass, 0)) { + const double loop_decay_rate = + get_prediction_decay_rate(cpi, &next_frame); + decay_accumulator *= loop_decay_rate; + decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR); + av_decay_accumulator += decay_accumulator; + ++loop_decay_counter; + } + boost_score += (decay_accumulator * frame_boost); + } + } + av_decay_accumulator /= (double)loop_decay_counter; + + reset_fpf_position(twopass, start_position); + + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_in_end, rc->frames_to_key); + + // Apply various clamps for min and max boost + rc->kf_boost = (int)(av_decay_accumulator * boost_score); + rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3)); + rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST); + + // Work out how many bits to allocate for the key frame itself. + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, + twopass->kf_group_bits); + + // Work out the fraction of the kf group bits reserved for the inter frames + // within the group after discounting the bits for the kf itself. + if (twopass->kf_group_bits) { + twopass->kfgroup_inter_fraction = + (double)(twopass->kf_group_bits - kf_bits) / + (double)twopass->kf_group_bits; + } else { + twopass->kfgroup_inter_fraction = 1.0; + } + + twopass->kf_group_bits -= kf_bits; + + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + gf_group->rf_level[0] = KF_STD; + + // Note the total error score of the kf group minus the key frame itself. + twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. + twopass->modified_error_left -= kf_group_err; + + if (oxcf->resize_mode == RESIZE_DYNAMIC) { + // Default to normal-sized frame on keyframes. + cpi->rc.next_frame_size_selector = UNSCALED; + } +} + +// Define the reference buffers that will be updated post encode. +static void configure_buffer_updates(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + + // Wei-Ting: Should we define another function to take care of + // cpi->rc.is_$Source_Type to make this function as it is in the comment? + + cpi->rc.is_src_frame_alt_ref = 0; +#if CONFIG_EXT_REFS + cpi->rc.is_bwd_ref_frame = 0; + cpi->rc.is_last_bipred_frame = 0; + cpi->rc.is_bipred_frame = 0; + cpi->rc.is_src_frame_ext_arf = 0; +#endif // CONFIG_EXT_REFS + + switch (twopass->gf_group.update_type[twopass->gf_group.index]) { + case KF_UPDATE: +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 1; +#endif // CONFIG_EXT_REFS + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + + case LF_UPDATE: +#if CONFIG_EXT_REFS + // If we have extra ALT_REFs, we can use the farthest ALT (ALT0) as + // the BWD_REF. + if (cpi->num_extra_arfs) { + int tmp = cpi->bwd_fb_idx; + + cpi->bwd_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->arf_map[0]; + cpi->arf_map[0] = tmp; + + cpi->rc.is_bwd_ref_frame = 1; + } else { + cpi->rc.is_bwd_ref_frame = 0; + } +#endif // CONFIG_EXT_REFS + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + + case GF_UPDATE: +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 0; +#endif // CONFIG_EXT_REFS + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 0; +#endif // CONFIG_EXT_REFS + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case ARF_UPDATE: +#if CONFIG_EXT_REFS + cpi->refresh_bwd_ref_frame = 1; +#endif // CONFIG_EXT_REFS + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + +#if CONFIG_EXT_REFS + case BRF_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_bwd_ref_frame = 1; + if (cpi->num_extra_arfs) { + // Allow BRF use the farthest ALT_REF (ALT0) as BWD_REF by swapping + // the virtual indices. + // NOTE: The indices will be swapped back after this frame is encoded + // (in av1_update_reference_frames()). + int tmp = cpi->bwd_fb_idx; + + cpi->bwd_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->arf_map[0]; + cpi->arf_map[0] = tmp; + } + break; + + case LAST_BIPRED_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_last_bipred_frame = 1; + break; + + case BIPRED_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_bipred_frame = 1; + break; + + case INTNL_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_bwd_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + cpi->rc.is_src_frame_ext_arf = 1; + break; +#endif // CONFIG_EXT_REFS + + default: assert(0); break; + } +} + +static int is_skippable_frame(const AV1_COMP *cpi) { + // If the current frame does not have non-zero motion vector detected in the + // first pass, and so do its previous and forward frames, then this frame + // can be skipped for partition check, and the partition size is assigned + // according to the variance + const TWO_PASS *const twopass = &cpi->twopass; + + return (!frame_is_intra_only(&cpi->common) && + twopass->stats_in - 2 > twopass->stats_in_start && + twopass->stats_in < twopass->stats_in_end && + (twopass->stats_in - 1)->pcnt_inter - + (twopass->stats_in - 1)->pcnt_motion == + 1 && + (twopass->stats_in - 2)->pcnt_inter - + (twopass->stats_in - 2)->pcnt_motion == + 1 && + twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); +} + +void av1_rc_get_second_pass_params(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frames_left; + FIRSTPASS_STATS this_frame; + + int target_rate; + + frames_left = (int)(twopass->total_stats.count - cm->current_video_frame); + + if (!twopass->stats_in) return; + + // If this is an arf frame then we dont want to read the stats file or + // advance the input pointer as we already have what we need. + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + configure_buffer_updates(cpi); + target_rate = gf_group->bit_allocation[gf_group->index]; + target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); + rc->base_frame_target = target_rate; + + cm->frame_type = INTER_FRAME; + + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + + return; + } + + aom_clear_system_state(); + + if (cpi->oxcf.rc_mode == AOM_Q) { + twopass->active_worst_quality = cpi->oxcf.cq_level; + } else if (cm->current_video_frame == 0) { + // Special case code for first frame. + const int section_target_bandwidth = + (int)(twopass->bits_left / frames_left); + const double section_length = twopass->total_left_stats.count; + const double section_error = + twopass->total_left_stats.coded_error / section_length; + const double section_intra_skip = + twopass->total_left_stats.intra_skip_pct / section_length; + const double section_inactive_zone = + (twopass->total_left_stats.inactive_zone_rows * 2) / + ((double)cm->mb_rows * section_length); + const int tmp_q = get_twopass_worst_quality( + cpi, section_error, section_intra_skip + section_inactive_zone, + section_target_bandwidth, DEFAULT_GRP_WEIGHT); + + twopass->active_worst_quality = tmp_q; + twopass->baseline_active_worst_quality = tmp_q; + rc->ni_av_qi = tmp_q; + rc->last_q[INTER_FRAME] = tmp_q; + rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->bit_depth); + rc->avg_frame_qindex[INTER_FRAME] = tmp_q; + rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2; + rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; + } + + av1_zero(this_frame); + if (EOF == input_stats(twopass, &this_frame)) return; + + // Set the frame content type flag. + if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; + + // Keyframe and section processing. + if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { + FIRSTPASS_STATS this_frame_copy; + this_frame_copy = this_frame; + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, &this_frame); + this_frame = this_frame_copy; + } else { + cm->frame_type = INTER_FRAME; + } + + // Define a new GF/ARF group. (Should always enter here for key frames). + if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, &this_frame); + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + +#if ARF_STATS_OUTPUT + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame, + rc->frames_till_gf_update_due, rc->kf_boost, arf_count, + rc->gfu_boost); + + fclose(fpfile); + } +#endif + } + + configure_buffer_updates(cpi); + + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + + target_rate = gf_group->bit_allocation[gf_group->index]; + + if (cpi->common.frame_type == KEY_FRAME) + target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate); + else + target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); + + rc->base_frame_target = target_rate; + + { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.MBs; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = + log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); + } + + // Update the total stats remaining structure. + subtract_stats(&twopass->total_left_stats, &this_frame); +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const int bits_used = rc->base_frame_target; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0); + + // Calculate the pct rc error. + if (rc->total_actual_bits) { + rc->rate_error_estimate = + (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); + } else { + rc->rate_error_estimate = 0; + } + + if (cpi->common.frame_type != KEY_FRAME) { + twopass->kf_group_bits -= bits_used; + twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; + } + twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); + + // Increment the gf group index ready for the next frame. + ++twopass->gf_group.index; + + // If the rate control is drifting consider adjustment to min or maxq. + if ((cpi->oxcf.rc_mode != AOM_Q) && + (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && + !cpi->rc.is_src_frame_alt_ref) { + const int maxq_adj_limit = + rc->worst_quality - twopass->active_worst_quality; + const int minq_adj_limit = + (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); + + // Undershoot. + if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) { + --twopass->extend_maxq; + if (rc->rolling_target_bits >= rc->rolling_actual_bits) + ++twopass->extend_minq; + // Overshoot. + } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) { + --twopass->extend_minq; + if (rc->rolling_target_bits < rc->rolling_actual_bits) + ++twopass->extend_maxq; + } else { + // Adjustment for extreme local overshoot. + if (rc->projected_frame_size > (2 * rc->base_frame_target) && + rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) + ++twopass->extend_maxq; + + // Unwind undershoot or overshoot adjustment. + if (rc->rolling_target_bits < rc->rolling_actual_bits) + --twopass->extend_minq; + else if (rc->rolling_target_bits > rc->rolling_actual_bits) + --twopass->extend_maxq; + } + + twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit); + twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); + + // If there is a big and undexpected undershoot then feed the extra + // bits back in quickly. One situation where this may happen is if a + // frame is unexpectedly almost perfectly predicted by the ARF or GF + // but not very well predcited by the previous frame. + if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { + int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; + if (rc->projected_frame_size < fast_extra_thresh) { + rc->vbr_bits_off_target_fast += + fast_extra_thresh - rc->projected_frame_size; + rc->vbr_bits_off_target_fast = + AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); + + // Fast adaptation of minQ if necessary to use up the extra bits. + if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } + } +} diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h new file mode 100644 index 000000000..db459cc22 --- /dev/null +++ b/third_party/aom/av1/encoder/firstpass.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_FIRSTPASS_H_ +#define AV1_ENCODER_FIRSTPASS_H_ + +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 + +#define FPMB_MOTION_ZERO_MASK 0x02 +#define FPMB_MOTION_LEFT_MASK 0x04 +#define FPMB_MOTION_RIGHT_MASK 0x08 +#define FPMB_MOTION_UP_MASK 0x10 +#define FPMB_MOTION_DOWN_MASK 0x20 + +#define FPMB_ERROR_SMALL_MASK 0x40 +#define FPMB_ERROR_LARGE_MASK 0x80 +#define FPMB_ERROR_SMALL_TH 2000 +#define FPMB_ERROR_LARGE_TH 48000 + +typedef struct { + uint8_t *mb_stats_start; + uint8_t *mb_stats_end; +} FIRSTPASS_MB_STATS; +#endif + +#if CONFIG_EXT_REFS +// Length of the bi-predictive frame group (BFG) +// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain +// number of bi-predictive frames. +#define BFG_INTERVAL 2 +// The maximum number of extra ALT_REF's +// NOTE: This number cannot be greater than 2 or the reference frame buffer will +// overflow. +#define MAX_EXT_ARFS 2 +#define MIN_EXT_ARF_INTERVAL 4 +#endif // CONFIG_EXT_REFS + +#define VLOW_MOTION_THRESHOLD 950 + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double intra_skip_pct; + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double new_mv_count; + double duration; + double count; +} FIRSTPASS_STATS; + +typedef enum { + KF_UPDATE = 0, + LF_UPDATE = 1, + GF_UPDATE = 2, + ARF_UPDATE = 3, + OVERLAY_UPDATE = 4, +#if CONFIG_EXT_REFS + BRF_UPDATE = 5, // Backward Reference Frame + LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame + BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one + INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame + FRAME_UPDATE_TYPES = 9 +#else + FRAME_UPDATE_TYPES = 5 +#endif // CONFIG_EXT_REFS +} FRAME_UPDATE_TYPE; + +#define FC_ANIMATION_THRESH 0.15 +typedef enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} FRAME_CONTENT_TYPE; + +typedef struct { + unsigned char index; + RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; + FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; +#if CONFIG_EXT_REFS + unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1]; +#endif // CONFIG_EXT_REFS + int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; +} GF_GROUP; + +typedef struct { + unsigned int section_intra_rating; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + const FIRSTPASS_STATS *stats_in; + const FIRSTPASS_STATS *stats_in_start; + const FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + double mb_av_energy; + +#if CONFIG_FP_MB_STATS + uint8_t *frame_mb_stats_buf; + uint8_t *this_frame_mb_stats; + FIRSTPASS_MB_STATS firstpass_mb_stats; +#endif + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + + // The fraction for a kf groups total bits allocated to the inter frames + double kfgroup_inter_fraction; + + int sr_update_lag; + + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int gf_zeromotion_pct; + int active_worst_quality; + int baseline_active_worst_quality; + int extend_minq; + int extend_maxq; + int extend_minq_fast; + + GF_GROUP gf_group; +} TWO_PASS; + +struct AV1_COMP; + +void av1_init_first_pass(struct AV1_COMP *cpi); +void av1_rc_get_first_pass_params(struct AV1_COMP *cpi); +void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source); +void av1_end_first_pass(struct AV1_COMP *cpi); + +void av1_init_second_pass(struct AV1_COMP *cpi); +void av1_rc_get_second_pass_params(struct AV1_COMP *cpi); +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +// Post encode update of the rate control parameters for 2-pass +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +void av1_init_subsampling(struct AV1_COMP *cpi); + +void av1_calculate_coded_size(struct AV1_COMP *cpi, int *scaled_frame_width, + int *scaled_frame_height); + +#if CONFIG_EXT_REFS +static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { + if (arf_pending && MAX_EXT_ARFS > 0) + return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) + ? MAX_EXT_ARFS + : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS + ? MAX_EXT_ARFS - 1 + : 0; + else + return 0; +} +#endif // CONFIG_EXT_REFS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_FIRSTPASS_H_ diff --git a/third_party/aom/av1/encoder/generic_encoder.c b/third_party/aom/av1/encoder/generic_encoder.c new file mode 100644 index 000000000..a31bb9ef6 --- /dev/null +++ b/third_party/aom/av1/encoder/generic_encoder.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include "aom_dsp/bitwriter.h" +#include "av1/common/generic_code.h" +#include "av1/common/odintrin.h" +#include "pvq_encoder.h" + +/** Encodes a value from 0 to N-1 (with N up to 16) based on a cdf and adapts + * the cdf accordingly. + * + * @param [in,out] w multi-symbol entropy encoder + * @param [in] val variable being encoded + * @param [in,out] cdf CDF of the variable (Q15) + * @param [in] n number of values possible + * @param [in,out] count number of symbols encoded with that cdf so far + * @param [in] rate adaptation rate shift (smaller is faster) + */ +void aom_encode_cdf_adapt_q15(aom_writer *w, int val, uint16_t *cdf, int n, + int *count, int rate) { + int i; + if (*count == 0) { + /* On the first call, we normalize the cdf to (32768 - n). This should + eventually be moved to the state init, but for now it makes it much + easier to experiment and convert symbols to the Q15 adaptation.*/ + int ft; + ft = cdf[n - 1]; + for (i = 0; i < n; i++) { + cdf[i] = AOM_ICDF(cdf[i]*32768/ft); + } + } + aom_write_cdf(w, val, cdf, n); + aom_cdf_adapt_q15(val, cdf, n, count, rate); +} + +/** Encodes a random variable using a "generic" model, assuming that the + * distribution is one-sided (zero and up), has a single mode, and decays + * exponentially past the model. + * + * @param [in,out] w multi-symbol entropy encoder + * @param [in,out] model generic probability model + * @param [in] x variable being encoded + * @param [in,out] ExQ16 expectation of x (adapted) + * @param [in] integration integration period of ExQ16 (leaky average over + * 1<> 1); + /* Choose the cdf to use: we have two per "octave" of ExQ16 */ + id = OD_MINI(GENERIC_TABLES - 1, lg_q1); + cdf = model->cdf[id]; + xs = (x + (1 << shift >> 1)) >> shift; + aom_write_symbol_pvq(w, OD_MINI(15, xs), cdf, 16); + if (xs >= 15) { + int e; + unsigned decay; + /* Estimate decay based on the assumption that the distribution is close + to Laplacian for large values. We should probably have an adaptive + estimate instead. Note: The 2* is a kludge that's not fully understood + yet. */ + OD_ASSERT(*ex_q16 < INT_MAX >> 1); + e = ((2**ex_q16 >> 8) + (1 << shift >> 1)) >> shift; + decay = OD_MAXI(2, OD_MINI(254, 256*e/(e + 256))); + /* Encode the tail of the distribution assuming exponential decay. */ + aom_laplace_encode_special(w, xs - 15, decay); + } + if (shift != 0) { + int special; + /* Because of the rounding, there's only half the number of possibilities + for xs=0. */ + special = xs == 0; + if (shift - special > 0) { + aom_write_literal(w, x - (xs << shift) + (!special << (shift - 1)), + shift - special); + } + } + generic_model_update(ex_q16, x, integration); + OD_LOG((OD_LOG_ENTROPY_CODER, OD_LOG_DEBUG, + "enc: %d %d %d %d %d %x", *ex_q16, x, shift, id, xs, enc->rng)); +} + +/** Estimates the cost of encoding a value with generic_encode(). + * + * @param [in,out] model generic probability model + * @param [in] x variable being encoded + * @param [in,out] ExQ16 expectation of x (adapted) + * @return number of bits (approximation) + */ +double generic_encode_cost(generic_encoder *model, int x, int *ex_q16) { + int lg_q1; + int shift; + int id; + uint16_t *cdf; + int xs; + int extra; + lg_q1 = log_ex(*ex_q16); + /* If expectation is too large, shift x to ensure that + all we have past xs=15 is the exponentially decaying tail + of the distribution */ + shift = OD_MAXI(0, (lg_q1 - 5) >> 1); + /* Choose the cdf to use: we have two per "octave" of ExQ16 */ + id = OD_MINI(GENERIC_TABLES - 1, lg_q1); + cdf = model->cdf[id]; + xs = (x + (1 << shift >> 1)) >> shift; + extra = 0; + if (shift) extra = shift - (xs == 0); + xs = OD_MINI(15, xs); + /* Shortcut: assume it's going to cost 2 bits for the Laplace coder. */ + if (xs == 15) extra += 2; + return + extra - OD_LOG2((double)(cdf[xs] - (xs == 0 ? 0 : cdf[xs - 1]))/cdf[15]); +} + +/*Estimates the cost of encoding a value with a given CDF.*/ +double od_encode_cdf_cost(int val, uint16_t *cdf, int n) { + int total_prob; + int prev_prob; + double val_prob; + OD_ASSERT(n > 0); + total_prob = cdf[n - 1]; + if (val == 0) { + prev_prob = 0; + } + else { + prev_prob = cdf[val - 1]; + } + val_prob = (cdf[val] - prev_prob) / (double)total_prob; + return -OD_LOG2(val_prob); +} diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c new file mode 100644 index 000000000..2a6204939 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion.c @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "av1/encoder/global_motion.h" + +#include "av1/common/warped_motion.h" + +#include "av1/encoder/segmentation.h" +#include "av1/encoder/corner_detect.h" +#include "av1/encoder/corner_match.h" +#include "av1/encoder/ransac.h" + +#define MAX_CORNERS 4096 +#define MIN_INLIER_PROB 0.1 + +#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) + +// Border over which to compute the global motion +#define ERRORADV_BORDER 0 + +#define ERRORADV_MAX_THRESH 0.995 +#define ERRORADV_COST_PRODUCT_THRESH 26000 + +int is_enough_erroradvantage(double best_erroradvantage, int params_cost) { + return best_erroradvantage < ERRORADV_MAX_THRESH && + best_erroradvantage * params_cost < ERRORADV_COST_PRODUCT_THRESH; +} + +static void convert_to_params(const double *params, int32_t *model) { + int i; + int alpha_present = 0; + model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + + for (i = 2; i < 6; ++i) { + const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0); + model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); + model[i] = + (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); + alpha_present |= (model[i] != 0); + model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; + } + for (; i < 8; ++i) { + model[i] = (int32_t)floor(params[i] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5); + model[i] = (int32_t)clamp(model[i], GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) * + GM_ROW3HOMO_DECODE_FACTOR; + alpha_present |= (model[i] != 0); + } + + if (!alpha_present) { + if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) { + model[0] = 0; + model[1] = 0; + } + } +} + +void convert_model_to_params(const double *params, WarpedMotionParams *model) { + convert_to_params(params, model->wmmat); + model->wmtype = get_gmtype(model); +} + +// Adds some offset to a global motion parameter and handles +// all of the necessary precision shifts, clamping, and +// zero-centering. +static int32_t add_param_offset(int param_index, int32_t param_value, + int32_t offset) { + const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF, + GM_ROW3HOMO_PREC_DIFF }; + const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX }; + // type of param: 0 - translation, 1 - affine, 2 - homography + const int param_type = (param_index < 2 ? 0 : (param_index < 6 ? 1 : 2)); + const int is_one_centered = (param_index == 2 || param_index == 5); + + // Make parameter zero-centered and offset the shift that was done to make + // it compatible with the warped model + param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> + scale_vals[param_type]; + // Add desired offset to the rescaled/zero-centered parameter + param_value += offset; + // Clamp the parameter so it does not overflow the number of bits allotted + // to it in the bitstream + param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], + clamp_vals[param_type]); + // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible + // with the warped motion library + param_value *= (1 << scale_vals[param_type]); + + // Undo the zero-centering step if necessary + return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); +} + +static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { + switch (wmtype) { + case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0; + case TRANSLATION: + wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; + wm->wmmat[3] = 0; + case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2]; + case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break; + case HORTRAPEZOID: wm->wmmat[6] = wm->wmmat[4] = 0; break; + case VERTRAPEZOID: wm->wmmat[7] = wm->wmmat[3] = 0; break; + case HOMOGRAPHY: break; + default: assert(0); + } + wm->wmtype = wmtype; +} + +double refine_integerized_param(WarpedMotionParams *wm, + TransformationType wmtype, +#if CONFIG_HIGHBITDEPTH + int use_hbd, int bd, +#endif // CONFIG_HIGHBITDEPTH + uint8_t *ref, int r_width, int r_height, + int r_stride, uint8_t *dst, int d_width, + int d_height, int d_stride, int n_refinements) { + static const int max_trans_model_params[TRANS_TYPES] = { + 0, 2, 4, 6, 8, 8, 8 + }; + const int border = ERRORADV_BORDER; + int i = 0, p; + int n_params = max_trans_model_params[wmtype]; + int32_t *param_mat = wm->wmmat; + double step_error; + int32_t step; + int32_t *param; + int32_t curr_param; + int32_t best_param; + double best_error; + + force_wmtype(wm, wmtype); + best_error = av1_warp_erroradv(wm, +#if CONFIG_HIGHBITDEPTH + use_hbd, bd, +#endif // CONFIG_HIGHBITDEPTH + ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, + border, d_width - 2 * border, + d_height - 2 * border, d_stride, 0, 0, 16, 16); + step = 1 << (n_refinements + 1); + for (i = 0; i < n_refinements; i++, step >>= 1) { + for (p = 0; p < n_params; ++p) { + int step_dir = 0; + // Skip searches for parameters that are forced to be 0 + if (wmtype == HORTRAPEZOID && (p == 4 || p == 6)) continue; + if (wmtype == VERTRAPEZOID && (p == 3 || p == 7)) continue; + param = param_mat + p; + curr_param = *param; + best_param = curr_param; + // look to the left + *param = add_param_offset(p, curr_param, -step); + step_error = av1_warp_erroradv( + wm, +#if CONFIG_HIGHBITDEPTH + use_hbd, bd, +#endif // CONFIG_HIGHBITDEPTH + ref, r_width, r_height, r_stride, dst + border * d_stride + border, + border, border, d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, 16, 16); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = -1; + } + + // look to the right + *param = add_param_offset(p, curr_param, step); + step_error = av1_warp_erroradv( + wm, +#if CONFIG_HIGHBITDEPTH + use_hbd, bd, +#endif // CONFIG_HIGHBITDEPTH + ref, r_width, r_height, r_stride, dst + border * d_stride + border, + border, border, d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, 16, 16); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = 1; + } + *param = best_param; + + // look to the direction chosen above repeatedly until error increases + // for the biggest step size + while (step_dir) { + *param = add_param_offset(p, best_param, step * step_dir); + step_error = av1_warp_erroradv( + wm, +#if CONFIG_HIGHBITDEPTH + use_hbd, bd, +#endif // CONFIG_HIGHBITDEPTH + ref, r_width, r_height, r_stride, dst + border * d_stride + border, + border, border, d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, 16, 16); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + } else { + *param = best_param; + step_dir = 0; + } + } + } + } + force_wmtype(wm, wmtype); + wm->wmtype = get_gmtype(wm); + return best_error; +} + +static INLINE RansacFunc get_ransac_type(TransformationType type) { + switch (type) { + case HOMOGRAPHY: return ransac_homography; + case HORTRAPEZOID: return ransac_hortrapezoid; + case VERTRAPEZOID: return ransac_vertrapezoid; + case AFFINE: return ransac_affine; + case ROTZOOM: return ransac_rotzoom; + case TRANSLATION: return ransac_translation; + default: assert(0); return NULL; + } +} + +#if CONFIG_HIGHBITDEPTH +static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, + int bit_depth) { + int i, j; + uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer); + uint8_t *buf = malloc(frm->y_height * frm->y_stride * sizeof(*buf)); + + for (i = 0; i < frm->y_height; ++i) + for (j = 0; j < frm->y_width; ++j) + buf[i * frm->y_stride + j] = + orig_buf[i * frm->y_stride + j] >> (bit_depth - 8); + + return buf; +} +#endif + +int compute_global_motion_feature_based( + TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, +#if CONFIG_HIGHBITDEPTH + int bit_depth, +#endif + int *num_inliers_by_motion, double *params_by_motion, int num_motions) { + int i; + int num_frm_corners, num_ref_corners; + int num_correspondences; + int *correspondences; + int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS]; + unsigned char *frm_buffer = frm->y_buffer; + unsigned char *ref_buffer = ref->y_buffer; + RansacFunc ransac = get_ransac_type(type); + +#if CONFIG_HIGHBITDEPTH + if (frm->flags & YV12_FLAG_HIGHBITDEPTH) { + // The frame buffer is 16-bit, so we need to convert to 8 bits for the + // following code. We cache the result until the frame is released. + if (frm->y_buffer_8bit) + frm_buffer = frm->y_buffer_8bit; + else + frm_buffer = frm->y_buffer_8bit = downconvert_frame(frm, bit_depth); + } + if (ref->flags & YV12_FLAG_HIGHBITDEPTH) { + if (ref->y_buffer_8bit) + ref_buffer = ref->y_buffer_8bit; + else + ref_buffer = ref->y_buffer_8bit = downconvert_frame(ref, bit_depth); + } +#endif + + // compute interest points in images using FAST features + num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height, + frm->y_stride, frm_corners, MAX_CORNERS); + num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height, + ref->y_stride, ref_corners, MAX_CORNERS); + + // find correspondences between the two images + correspondences = + (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences)); + num_correspondences = determine_correspondence( + frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer, + (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height, + frm->y_stride, ref->y_stride, correspondences); + + ransac(correspondences, num_correspondences, num_inliers_by_motion, + params_by_motion, num_motions); + + free(correspondences); + + // Set num_inliers = 0 for motions with too few inliers so they are ignored. + for (i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) { + num_inliers_by_motion[i] = 0; + } + } + + // Return true if any one of the motions has inliers. + for (i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } + return 0; +} diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h new file mode 100644 index 000000000..8fc757f38 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_GLOBAL_MOTION_H_ +#define AV1_ENCODER_GLOBAL_MOTION_H_ + +#include "aom/aom_integer.h" +#include "aom_scale/yv12config.h" +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RANSAC_NUM_MOTIONS 1 + +void convert_model_to_params(const double *params, WarpedMotionParams *model); + +int is_enough_erroradvantage(double erroradv, int params_cost); + +double refine_integerized_param(WarpedMotionParams *wm, + TransformationType wmtype, +#if CONFIG_HIGHBITDEPTH + int use_hbd, int bd, +#endif // CONFIG_HIGHBITDEPTH + uint8_t *ref, int r_width, int r_height, + int r_stride, uint8_t *dst, int d_width, + int d_height, int d_stride, int n_refinements); + +/* + Computes "num_motions" candidate global motion parameters between two frames. + The array "params_by_motion" should be length 8 * "num_motions". The ordering + of each set of parameters is best described by the homography: + + [x' (m2 m3 m0 [x + z . y' = m4 m5 m1 * y + 1] m6 m7 1) 1] + + where m{i} represents the ith value in any given set of parameters. + + "num_inliers" should be length "num_motions", and will be populated with the + number of inlier feature points for each motion. Params for which the + num_inliers entry is 0 should be ignored by the caller. +*/ +int compute_global_motion_feature_based( + TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, +#if CONFIG_HIGHBITDEPTH + int bit_depth, +#endif + int *num_inliers_by_motion, double *params_by_motion, int num_motions); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AV1_ENCODER_GLOBAL_MOTION_H_ diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c new file mode 100644 index 000000000..4fd563163 --- /dev/null +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "av1/common/idct.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +#if CONFIG_CB4X4 +static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, int lossless) { + tran_high_t a1 = src_diff[0]; + tran_high_t b1 = src_diff[1]; + tran_high_t c1 = src_diff[diff_stride]; + tran_high_t d1 = src_diff[1 + diff_stride]; + + tran_high_t a2 = a1 + c1; + tran_high_t b2 = b1 + d1; + tran_high_t c2 = a1 - c1; + tran_high_t d2 = b1 - d1; + + a1 = a2 + b2; + b1 = a2 - b2; + c1 = c2 + d2; + d1 = c2 - d2; + + coeff[0] = (tran_low_t)(4 * a1); + coeff[1] = (tran_low_t)(4 * b1); + coeff[2] = (tran_low_t)(4 * c1); + coeff[3] = (tran_low_t)(4 * d1); + + (void)tx_type; + (void)lossless; +} +#endif + +static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, int lossless) { + if (lossless) { + assert(tx_type == DCT_DCT); + av1_fwht4x4(src_diff, coeff, diff_stride); + return; + } + + av1_fht4x4(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht4x8(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht8x4(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht8x16(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht16x8(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht16x32(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht32x16(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht8x8(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht16x16(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht32x32(src_diff, coeff, diff_stride, tx_type); +} + +#if CONFIG_TX64X64 +static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; +#if CONFIG_EXT_TX + if (tx_type == IDTX) + av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); + else +#endif + av1_fht64x64(src_diff, coeff, diff_stride, tx_type); +} +#endif // CONFIG_TX64X64 + +#if CONFIG_HIGHBITDEPTH +#if CONFIG_CB4X4 +static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, int lossless, + const int bd) { + tran_high_t a1 = src_diff[0]; + tran_high_t b1 = src_diff[1]; + tran_high_t c1 = src_diff[diff_stride]; + tran_high_t d1 = src_diff[1 + diff_stride]; + + tran_high_t a2 = a1 + c1; + tran_high_t b2 = b1 + d1; + tran_high_t c2 = a1 - c1; + tran_high_t d2 = b1 - d1; + + a1 = a2 + b2; + b1 = a2 - b2; + c1 = c2 + d2; + d1 = c2 - d2; + + coeff[0] = (tran_low_t)(4 * a1); + coeff[1] = (tran_low_t)(4 * b1); + coeff[2] = (tran_low_t)(4 * c1); + coeff[3] = (tran_low_t)(4 * d1); + + (void)tx_type; + (void)lossless; + (void)bd; +} +#endif + +static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, int lossless, + const int bd) { + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_fwht4x4(src_diff, coeff, diff_stride); + return; + } + + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd); + break; + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht8x16(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht16x8(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht16x32(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type); +} + +static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd); + break; + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + // Use C version since DST exists only in C + av1_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd); + break; + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + // Use C version since DST exists only in C + av1_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + switch (tx_type) { + case DCT_DCT: + av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} + +#if CONFIG_TX64X64 +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt, const int bd) { + (void)fwd_txfm_opt; + (void)bd; + switch (tx_type) { + case DCT_DCT: + av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type); + break; + case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + FWD_TXFM_PARAM *fwd_txfm_param) { + const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL; + const TX_TYPE tx_type = fwd_txfm_param->tx_type; + const TX_SIZE tx_size = fwd_txfm_param->tx_size; + const int lossless = fwd_txfm_param->lossless; + switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; +#endif // CONFIG_TX64X64 + case TX_32X32: + fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_16X16: + fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_8X8: + fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_4X8: + fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_8X4: + fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_8X16: + fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_16X8: + fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_16X32: + fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_32X16: + fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_4X4: + fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless); + break; +#if CONFIG_CB4X4 + case TX_2X2: + fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless); + break; +#endif + default: assert(0); break; + } +} + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) { + const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL; + const TX_TYPE tx_type = fwd_txfm_param->tx_type; + const TX_SIZE tx_size = fwd_txfm_param->tx_size; + const int lossless = fwd_txfm_param->lossless; + const int bd = fwd_txfm_param->bd; + switch (tx_size) { +#if CONFIG_TX64X64 + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; +#endif // CONFIG_TX64X64 + case TX_32X32: + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_16X16: + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_8X8: + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_8X16: + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_16X8: + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_16X32: + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_32X16: + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, + bd); + break; + case TX_4X4: + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd); + break; +#if CONFIG_CB4X4 + case TX_2X2: + highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless, bd); + break; +#endif + default: assert(0); break; + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h new file mode 100644 index 000000000..e6fd17275 --- /dev/null +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AV1_ENCODER_HYBRID_FWD_TXFM_H_ + +#include "./aom_config.h" + +typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL } FWD_TXFM_OPT; + +typedef struct FWD_TXFM_PARAM { + TX_TYPE tx_type; + TX_SIZE tx_size; + int lossless; +#if CONFIG_HIGHBITDEPTH + int bd; +#endif // CONFIG_HIGHBITDEPTH +} FWD_TXFM_PARAM; + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + FWD_TXFM_PARAM *fwd_txfm_param); + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param); +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/third_party/aom/av1/encoder/laplace_encoder.c b/third_party/aom/av1/encoder/laplace_encoder.c new file mode 100644 index 000000000..54ffc88fb --- /dev/null +++ b/third_party/aom/av1/encoder/laplace_encoder.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +#include "aom_dsp/bitwriter.h" +#include "av1/common/odintrin.h" +#include "av1/common/pvq.h" +#include "pvq_encoder.h" + +static void aom_encode_pvq_split(aom_writer *w, od_pvq_codeword_ctx *adapt, + int count, int sum, int ctx) { + int shift; + int rest; + int fctx; + if (sum == 0) return; + shift = OD_MAXI(0, OD_ILOG(sum) - 3); + if (shift) { + rest = count & ((1 << shift) - 1); + count >>= shift; + sum >>= shift; + } + fctx = 7*ctx + sum - 1; + aom_write_symbol_pvq(w, count, adapt->pvq_split_cdf[fctx], sum + 1); + if (shift) aom_write_literal(w, rest, shift); +} + +void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, + const int *y, int n, int k, int level) { + int mid; + int i; + int count_right; + if (n <= 1 || k == 0) return; + if (k == 1 && n <= 16) { + int cdf_id; + int pos; + cdf_id = od_pvq_k1_ctx(n, level == 0); + for (pos = 0; !y[pos]; pos++); + OD_ASSERT(pos < n); + aom_write_symbol_pvq(w, pos, adapt->pvq_k1_cdf[cdf_id], n); + } + else { + mid = n >> 1; + count_right = k; + for (i = 0; i < mid; i++) count_right -= abs(y[i]); + aom_encode_pvq_split(w, adapt, count_right, k, od_pvq_size_ctx(n)); + aom_encode_band_pvq_splits(w, adapt, y, mid, k - count_right, level + 1); + aom_encode_band_pvq_splits(w, adapt, y + mid, n - mid, count_right, + level + 1); + } +} + +/** Encodes the tail of a Laplace-distributed variable, i.e. it doesn't + * do anything special for the zero case. + * + * @param [in,out] enc range encoder + * @param [in] x variable to encode (has to be positive) + * @param [in] decay decay factor of the distribution in Q8 format, + * i.e. pdf ~= decay^x + */ +void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay) { + int shift; + int xs; + int sym; + const uint16_t *cdf; + shift = 0; + /* We don't want a large decay value because that would require too many + symbols. */ + while (decay > 235) { + decay = (decay*decay + 128) >> 8; + shift++; + } + decay = OD_MINI(decay, 254); + decay = OD_MAXI(decay, 2); + xs = x >> shift; + cdf = EXP_CDF_TABLE[(decay + 1) >> 1]; + OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "decay = %d", decay)); + do { + sym = OD_MINI(xs, 15); + { + int i; + OD_LOG((OD_LOG_PVQ, OD_LOG_DEBUG, "%d %d %d %d %d\n", x, xs, shift, + sym, max)); + for (i = 0; i < 16; i++) { + OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "%d ", cdf[i])); + } + OD_LOG_PARTIAL((OD_LOG_PVQ, OD_LOG_DEBUG, "\n")); + } + aom_write_cdf(w, sym, cdf, 16); + xs -= 15; + } while (sym >= 15); + if (shift) aom_write_literal(w, x & ((1 << shift) - 1), shift); +} diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c new file mode 100644 index 000000000..591ca6152 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "./aom_config.h" + +#include "av1/common/common.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/lookahead.h" + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) { + int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if (++index >= ctx->max_sz) index -= ctx->max_sz; + *idx = index; + return buf; +} + +void av1_lookahead_destroy(struct lookahead_ctx *ctx) { + if (ctx) { + if (ctx->buf) { + int i; + + for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img); + free(ctx->buf); + } + free(ctx); + } +} + +struct lookahead_ctx *av1_lookahead_init(unsigned int width, + unsigned int height, + unsigned int subsampling_x, + unsigned int subsampling_y, +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth, +#endif + unsigned int depth) { + struct lookahead_ctx *ctx = NULL; + + // Clamp the lookahead queue depth + depth = clamp(depth, 1, MAX_LAG_BUFFERS); + + // Allocate memory to keep previous source frames available. + depth += MAX_PRE_FRAMES; + + // Allocate the lookahead structures + ctx = calloc(1, sizeof(*ctx)); + if (ctx) { + const int legacy_byte_alignment = 0; + unsigned int i; + ctx->max_sz = depth; + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + if (!ctx->buf) goto bail; + for (i = 0; i < depth; i++) + if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, + subsampling_y, +#if CONFIG_HIGHBITDEPTH + use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, legacy_byte_alignment)) + goto bail; + } + return ctx; +bail: + av1_lookahead_destroy(ctx); + return NULL; +} + +#define USE_PARTIAL_COPY 0 + +int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth, +#endif + aom_enc_frame_flags_t flags) { + struct lookahead_entry *buf; +#if USE_PARTIAL_COPY + int row, col, active_end; + int mb_rows = (src->y_height + 15) >> 4; + int mb_cols = (src->y_width + 15) >> 4; +#endif + int width = src->y_crop_width; + int height = src->y_crop_height; + int uv_width = src->uv_crop_width; + int uv_height = src->uv_crop_height; + int subsampling_x = src->subsampling_x; + int subsampling_y = src->subsampling_y; + int larger_dimensions, new_dimensions; + + if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1; + ctx->sz++; + buf = pop(ctx, &ctx->write_idx); + + new_dimensions = width != buf->img.y_crop_width || + height != buf->img.y_crop_height || + uv_width != buf->img.uv_crop_width || + uv_height != buf->img.uv_crop_height; + larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || + uv_width > buf->img.uv_width || + uv_height > buf->img.uv_height; + assert(!larger_dimensions || new_dimensions); + +#if USE_PARTIAL_COPY + // TODO(jkoleszar): This is disabled for now, as + // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware. + + // Only do this partial copy if the following conditions are all met: + // 1. Lookahead queue has has size of 1. + // 2. Active map is provided. + // 3. This is not a key frame, golden nor altref frame. + if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) { + for (row = 0; row < mb_rows; ++row) { + col = 0; + + while (1) { + // Find the first active macroblock in this row. + for (; col < mb_cols; ++col) { + if (active_map[col]) break; + } + + // No more active macroblock in this row. + if (col == mb_cols) break; + + // Find the end of active region in this row. + active_end = col; + + for (; active_end < mb_cols; ++active_end) { + if (!active_map[active_end]) break; + } + + // Only copy this active region. + av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, + 16, (active_end - col) << 4); + + // Start again from the end of this active region. + col = active_end; + } + + active_map += mb_cols; + } + } else { +#endif + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, +#if CONFIG_HIGHBITDEPTH + use_highbitdepth, +#endif + AOM_BORDER_IN_PIXELS, 0)) + return 1; + aom_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; + } + // Partial copy not implemented yet + av1_copy_and_extend_frame(src, &buf->img); +#if USE_PARTIAL_COPY + } +#endif + + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->flags = flags; + return 0; +} + +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = NULL; + + if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { + buf = pop(ctx, &ctx->read_idx); + ctx->sz--; + } + return buf; +} + +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, + int index) { + struct lookahead_entry *buf = NULL; + + if (index >= 0) { + // Forward peek + if (index < ctx->sz) { + index += ctx->read_idx; + if (index >= ctx->max_sz) index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (index < 0) { + // Backward peek + if (-index <= MAX_PRE_FRAMES) { + index += (int)(ctx->read_idx); + if (index < 0) index += (int)(ctx->max_sz); + buf = ctx->buf + index; + } + } + + return buf; +} + +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; } diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h new file mode 100644 index 000000000..19f75d7e4 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_LOOKAHEAD_H_ +#define AV1_ENCODER_LOOKAHEAD_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LAG_BUFFERS 25 + +struct lookahead_entry { + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + aom_enc_frame_flags_t flags; +}; + +// The max of past frames we want to keep in the queue. +#define MAX_PRE_FRAMES 1 + +struct lookahead_ctx { + int max_sz; /* Absolute size of the queue */ + int sz; /* Number of buffers currently in the queue */ + int read_idx; /* Read index */ + int write_idx; /* Write index */ + struct lookahead_entry *buf; /* Buffer list */ +}; + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + */ +struct lookahead_ctx *av1_lookahead_init(unsigned int width, + unsigned int height, + unsigned int subsampling_x, + unsigned int subsampling_y, +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth, +#endif + unsigned int depth); + +/**\brief Destroys the lookahead stage + */ +void av1_lookahead_destroy(struct lookahead_ctx *ctx); + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. + * + * If active_map is non-NULL and there is only one frame in the queue, then copy + * only active macroblocks. + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] flags Flags set on this frame + * \param[in] active_map Map that specifies which macroblock is active + */ +int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, +#if CONFIG_HIGHBITDEPTH + int use_highbitdepth, +#endif + aom_enc_frame_flags_t flags); + +/**\brief Get the next source buffer to encode + * + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * + * \retval NULL, if drain set and queue is empty + * \retval NULL, if drain not set and queue not of the configured depth + */ +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain); + +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * + * \retval NULL, if no buffer exists at the specified index + */ +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, + int index); + +/**\brief Get the number of frames currently in the lookahead queue + * + * \param[in] ctx Pointer to the lookahead context + */ +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_LOOKAHEAD_H_ diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c new file mode 100644 index 000000000..1296027dc --- /dev/null +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./av1_rtcd.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/system_state.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/mcomp.h" +#include "av1/common/blockd.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, + int mb_row, int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; + + const MvLimits tmp_mv_limits = x->mv_limits; + MV ref_full; + int cost_list[5]; + + // Further step/diamond searches as necessary + int step_param = mv_sf->reduce_first_step_size; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + av1_set_mv_search_range(&x->mv_limits, ref_mv); + + ref_full.col = ref_mv->col >> 3; + ref_full.row = ref_mv->row >> 3; + + /*cpi->sf.search_method == HEX*/ + av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0, + cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv); + + // Try sub-pixel MC + // if (bestsme > error_thresh && bestsme < INT_MAX) + { + int distortion; + unsigned int sse; + cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &v_fn_ptr, 0, + mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, + &distortion, &sse, NULL, 0, 0, 0); + } + +#if CONFIG_EXT_INTER + if (has_second_ref(&xd->mi[0]->mbmi)) + xd->mi[0]->mbmi.mode = NEW_NEWMV; + else +#endif // CONFIG_EXT_INTER + xd->mi[0]->mbmi.mode = NEWMV; + + xd->mi[0]->mbmi.mv[0] = x->best_mv; +#if CONFIG_EXT_INTER + xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; +#endif // CONFIG_EXT_INTER + + av1_build_inter_predictors_sby(xd, mb_row, mb_col, NULL, BLOCK_16X16); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); +} + +static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row, + int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + unsigned int err, tmp_err; + MV best_mv; + + // Try zero MV first + // FIXME should really use something like near/nearest MV and/or MV prediction + err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + best_mv.col = best_mv.row = 0; + + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search + tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col); + if (tmp_err < err) { + err = tmp_err; + best_mv = x->best_mv.as_mv; + } + + // If the current best reference mv is not centered on 0,0 then do a 0,0 + // based search as well. + if (ref_mv->row != 0 || ref_mv->col != 0) { + MV zero_ref_mv = { 0, 0 }; + + tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col); + if (tmp_err < err) { + err = tmp_err; + best_mv = x->best_mv.as_mv; + } + } + + x->best_mv.as_mv = best_mv; + return err; +} + +static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + unsigned int err; + + // Try zero MV first + // FIXME should really use something like near/nearest MV and/or MV prediction + err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + + dst_mv->as_int = 0; + + return err; +} +static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + PREDICTION_MODE best_mode = -1, mode; + unsigned int best_err = INT_MAX; + + // calculate SATD for each intra prediction mode; + // we're intentionally not doing 4x4, we just want a rough estimate + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + unsigned int err; + + xd->mi[0]->mbmi.mode = mode; + av1_predict_intra_block(xd, 16, 16, BLOCK_16X16, mode, x->plane[0].src.buf, + x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, 0, 0, 0); + err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); + + // find best + if (err < best_err) { + best_err = err; + best_mode = mode; + } + } + + if (pbest_mode) *pbest_mode = best_mode; + + return best_err; +} + +static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats, + YV12_BUFFER_CONFIG *buf, int mb_y_offset, + YV12_BUFFER_CONFIG *golden_ref, + const MV *prev_golden_ref_mv, + YV12_BUFFER_CONFIG *alt_ref, int mb_row, + int mb_col) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + int intra_error; + AV1_COMMON *cm = &cpi->common; + + // FIXME in practice we're completely ignoring chroma here + x->plane[0].src.buf = buf->y_buffer + mb_y_offset; + x->plane[0].src.stride = buf->y_stride; + + xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset; + xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; + + // do intra 16x16 prediction + intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode); + if (intra_error <= 0) intra_error = 1; + stats->ref[INTRA_FRAME].err = intra_error; + + // Golden frame MV search, if it exists and is different than last frame + if (golden_ref) { + int g_motion_error; + xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset; + xd->plane[0].pre[0].stride = golden_ref->y_stride; + g_motion_error = + do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col); + stats->ref[GOLDEN_FRAME].m.mv = x->best_mv; + stats->ref[GOLDEN_FRAME].err = g_motion_error; + } else { + stats->ref[GOLDEN_FRAME].err = INT_MAX; + stats->ref[GOLDEN_FRAME].m.mv.as_int = 0; + } + + // Do an Alt-ref frame MV search, if it exists and is different than + // last/golden frame. + if (alt_ref) { + int a_motion_error; + xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset; + xd->plane[0].pre[0].stride = alt_ref->y_stride; + a_motion_error = + do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv); + + stats->ref[ALTREF_FRAME].err = a_motion_error; + } else { + stats->ref[ALTREF_FRAME].err = INT_MAX; + stats->ref[ALTREF_FRAME].m.mv.as_int = 0; + } +} + +static void update_mbgraph_frame_stats(AV1_COMP *cpi, + MBGRAPH_FRAME_STATS *stats, + YV12_BUFFER_CONFIG *buf, + YV12_BUFFER_CONFIG *golden_ref, + YV12_BUFFER_CONFIG *alt_ref) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + AV1_COMMON *const cm = &cpi->common; + + int mb_col, mb_row, offset = 0; + int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; + MV gld_top_mv = { 0, 0 }; + MODE_INFO mi_local; + + av1_zero(mi_local); + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. + x->mv_limits.row_min = -BORDER_MV_PIXELS_B16; + x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; + xd->up_available = 0; + xd->plane[0].dst.stride = buf->y_stride; + xd->plane[0].pre[0].stride = buf->y_stride; + xd->plane[1].dst.stride = buf->uv_stride; + xd->mi[0] = &mi_local; + mi_local.mbmi.sb_type = BLOCK_16X16; + mi_local.mbmi.ref_frame[0] = LAST_FRAME; + mi_local.mbmi.ref_frame[1] = NONE_FRAME; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + MV gld_left_mv = gld_top_mv; + int mb_y_in_offset = mb_y_offset; + int arf_y_in_offset = arf_y_offset; + int gld_y_in_offset = gld_y_offset; + + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. + x->mv_limits.col_min = -BORDER_MV_PIXELS_B16; + x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; + xd->left_available = 0; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col]; + + update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref, + &gld_left_mv, alt_ref, mb_row, mb_col); + gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv; + if (mb_col == 0) { + gld_top_mv = gld_left_mv; + } + xd->left_available = 1; + mb_y_in_offset += 16; + gld_y_in_offset += 16; + arf_y_in_offset += 16; + x->mv_limits.col_min -= 16; + x->mv_limits.col_max -= 16; + } + xd->up_available = 1; + mb_y_offset += buf->y_stride * 16; + gld_y_offset += golden_ref->y_stride * 16; + if (alt_ref) arf_y_offset += alt_ref->y_stride * 16; + x->mv_limits.row_min -= 16; + x->mv_limits.row_max -= 16; + offset += cm->mb_cols; + } +} + +// void separate_arf_mbs_byzz +static void separate_arf_mbs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int mb_col, mb_row, offset, i; + int mi_row, mi_col; + int ncnt[4] = { 0 }; + int n_frames = cpi->mbgraph_n_frames; + + int *arf_not_zz; + + CHECK_MEM_ERROR( + cm, arf_not_zz, + aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); + + // We are not interested in results beyond the alt ref itself. + if (n_frames > cpi->rc.frames_till_gf_update_due) + n_frames = cpi->rc.frames_till_gf_update_due; + + // defer cost to reference frames + for (i = n_frames - 1; i >= 0; i--) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + + for (offset = 0, mb_row = 0; mb_row < cm->mb_rows; + offset += cm->mb_cols, mb_row++) { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col]; + + int altref_err = mb_stats->ref[ALTREF_FRAME].err; + int intra_err = mb_stats->ref[INTRA_FRAME].err; + int golden_err = mb_stats->ref[GOLDEN_FRAME].err; + + // Test for altref vs intra and gf and that its mv was 0,0. + if (altref_err > 1000 || altref_err > intra_err || + altref_err > golden_err) { + arf_not_zz[offset + mb_col]++; + } + } + } + } + + // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out + // of bound access in segmentation_map + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + // If any of the blocks in the sequence failed then the MB + // goes in segment 0 + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { + ncnt[0]++; + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; + } else { + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1; + ncnt[1]++; + } + } + } + + // Only bother with segmentation if over 10% of the MBs in static segment + // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) + if (1) { + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else + cpi->static_mb_pct = 0; + + av1_enable_segmentation(&cm->seg); + } else { + cpi->static_mb_pct = 0; + av1_disable_segmentation(&cm->seg); + } + + // Free localy allocated storage + aom_free(arf_not_zz); +} + +void av1_update_mbgraph_stats(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i, n_frames = av1_lookahead_depth(cpi->lookahead); + YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + + assert(golden_ref != NULL); + + // we need to look ahead beyond where the ARF transitions into + // being a GF - so exit if we don't look ahead beyond that + if (n_frames <= cpi->rc.frames_till_gf_update_due) return; + + if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS; + + cpi->mbgraph_n_frames = n_frames; + for (i = 0; i < n_frames; i++) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + memset(frame_stats->mb_stats, 0, + cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats)); + } + + // do motion search to find contribution of each reference to data + // later on in this GF group + // FIXME really, the GF/last MC search should be done forward, and + // the ARF MC search backwards, to get optimal results for MV caching + for (i = 0; i < n_frames; i++) { + MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; + struct lookahead_entry *q_cur = av1_lookahead_peek(cpi->lookahead, i); + + assert(q_cur != NULL); + + update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref, + cpi->source); + } + + aom_clear_system_state(); + + separate_arf_mbs(cpi); +} diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h new file mode 100644 index 000000000..758e2ad15 --- /dev/null +++ b/third_party/aom/av1/encoder/mbgraph.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_MBGRAPH_H_ +#define AV1_ENCODER_MBGRAPH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + struct { + int err; + union { + int_mv mv; + PREDICTION_MODE mode; + } m; + } ref[TOTAL_REFS_PER_FRAME]; +} MBGRAPH_MB_STATS; + +typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; + +struct AV1_COMP; + +void av1_update_mbgraph_stats(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_MBGRAPH_H_ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c new file mode 100644 index 000000000..d069eefb0 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.c @@ -0,0 +1,3493 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/common.h" +#include "av1/common/mvref_common.h" +#include "av1/common/reconinter.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rdopt.h" + +// #define NEW_DIAMOND_SEARCH + +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + +void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { + int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1); + row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1); + col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1); + row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1); + + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. + if (mv_limits->col_min < col_min) mv_limits->col_min = col_min; + if (mv_limits->col_max > col_max) mv_limits->col_max = col_max; + if (mv_limits->row_min < row_min) mv_limits->row_min = row_min; + if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; +} + +static void av1_set_subpel_mv_search_range(const MvLimits *mv_limits, + int *col_min, int *col_max, + int *row_min, int *row_max, + const MV *ref_mv) { + const int max_mv = MAX_FULL_PEL_VAL * 8; + const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv); + const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv); + const int minr = AOMMAX(mv_limits->row_min * 8, ref_mv->row - max_mv); + const int maxr = AOMMIN(mv_limits->row_max * 8, ref_mv->row + max_mv); + + *col_min = AOMMAX(MV_LOW + 1, minc); + *col_max = AOMMIN(MV_UPP - 1, maxc); + *row_min = AOMMAX(MV_LOW + 1, minr); + *row_max = AOMMIN(MV_UPP - 1, maxr); +} + +int av1_init_search_range(int size) { + int sr = 0; + // Minimum search size no matter what the passed in value. + size = AOMMAX(16, size); + + while ((size << sr) < MAX_FULL_PEL_VAL) sr++; + + sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2); + return sr; +} + +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int weight) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7); +} + +#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int error_per_bit) { + if (mvcost) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + return (int)ROUND_POWER_OF_TWO_64( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; +} + +static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, + int sad_per_bit) { + const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 }; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->mvsadcost) * sad_per_bit, + AV1_PROB_COST_SHIFT); +} + +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) { + int len, ss_count = 1; + + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; + + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { + // Generate offsets for 4 search sites per step. + const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } }; + int i; + for (i = 0; i < 4; ++i) { + search_site *const ss = &cfg->ss[ss_count++]; + ss->mv = ss_mvs[i]; + ss->offset = ss->mv.row * stride + ss->mv.col; + } + } + + cfg->ss_count = ss_count; + cfg->searches_per_step = 4; +} + +void av1_init3smotion_compensation(search_site_config *cfg, int stride) { + int len, ss_count = 1; + + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; + + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { + // Generate offsets for 8 search sites per step. + const MV ss_mvs[8] = { { -len, 0 }, { len, 0 }, { 0, -len }, + { 0, len }, { -len, -len }, { -len, len }, + { len, -len }, { len, len } }; + int i; + for (i = 0; i < 8; ++i) { + search_site *const ss = &cfg->ss[ss_count++]; + ss->mv = ss_mvs[i]; + ss->offset = ss->mv.row * stride + ss->mv.col; + } + } + + cfg->ss_count = ss_count; + cfg->searches_per_step = 8; +} + +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. + * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ + +// convert motion vector component to offset for sv[a]f calc +static INLINE int sp(int x) { return x & 7; } + +static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { + return &buf[(r >> 3) * stride + (c >> 3)]; +} + +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) + +static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r, + int c) { + return &buf[(r)*stride + (c)]; +} + +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ + upre(y, y_stride, r, c), y_stride, \ + second_pred, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#define FIRST_LEVEL_CHECKS \ + { \ + unsigned int left, right, up, down, diag; \ + CHECK_BETTER(left, tr, tc - hstep); \ + CHECK_BETTER(right, tr, tc + hstep); \ + CHECK_BETTER(up, tr - hstep, tc); \ + CHECK_BETTER(down, tr + hstep, tc); \ + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \ + switch (whichdir) { \ + case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \ + case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \ + case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \ + case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \ + } \ + } + +#define SECOND_LEVEL_CHECKS \ + { \ + int kr, kc; \ + unsigned int second; \ + if (tr != br && tc != bc) { \ + kr = br - tr; \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + kr, tc + 2 * kc); \ + CHECK_BETTER(second, tr + 2 * kr, tc + kc); \ + } else if (tr == br && tc != bc) { \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \ + CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \ + switch (whichdir) { \ + case 0: \ + case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \ + case 2: \ + case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \ + } \ + } else if (tr != br && tc == bc) { \ + kr = br - tr; \ + CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \ + CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \ + switch (whichdir) { \ + case 0: \ + case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \ + case 1: \ + case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \ + } \ + } \ + } + +// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of +// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten +// later in the same way. +#define SECOND_LEVEL_CHECKS_BEST(k) \ + { \ + unsigned int second; \ + int br0 = br; \ + int bc0 = bc; \ + assert(tr == br || tc == bc); \ + if (tr == br && tc != bc) { \ + kc = bc - tc; \ + } else if (tr != br && tc == bc) { \ + kr = br - tr; \ + } \ + CHECK_BETTER##k(second, br0 + kr, bc0); \ + CHECK_BETTER##k(second, br0, bc0 + kc); \ + if (br0 != br || bc0 != bc) { \ + CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \ + } \ + } + +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const src_address = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = INT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + MV *bestmv = &x->best_mv.as_mv; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + \ + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \ + ref_mv); \ + \ + bestmv->row *= 8; \ + bestmv->col *= 8; + +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, const uint8_t *const y, + int y_stride, const uint8_t *second_pred, int w, int h, int offset, + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { + unsigned int besterr; +#if CONFIG_HIGHBITDEPTH + if (second_pred != NULL) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } + } else { + besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); +#else + (void)xd; + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); +#endif // CONFIG_HIGHBITDEPTH + return besterr; +} + +static INLINE int divide_and_round(int n, int d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { + *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), + (cost_list[1] - 2 * cost_list[0] + cost_list[3])); + *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), + (cost_list[4] - 2 * cost_list[0] + cost_list[2])); +} + +int av1_find_best_sub_pixel_tree_pruned_evenmore( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + SETUP_SUBPEL_SEARCH; + besterr = setup_center_error( + xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, + y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + (void)halfiters; + (void)quarteriters; + (void)eighthiters; + (void)whichdir; + (void)allow_hp; + (void)forced_stop; + (void)hstep; + (void)use_upsampled_ref; + + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { + int ir, ic; + unsigned int minpt; + get_cost_surf_min(cost_list, &ir, &ic, 2); + if (ir != 0 || ic != 0) { + CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + } + } + + tr = br; + tc = bc; + + if (allow_hp && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +int av1_find_best_sub_pixel_tree_pruned_more( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + SETUP_SUBPEL_SEARCH; + (void)use_upsampled_ref; + + besterr = setup_center_error( + xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, + y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { + unsigned int minpt; + int ir, ic; + get_cost_surf_min(cost_list, &ir, &ic, 1); + if (ir != 0 || ic != 0) { + CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep); + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + tr = br; + tc = bc; + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + if (allow_hp && forced_stop == 0) { + tr = br; + tc = bc; + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +int av1_find_best_sub_pixel_tree_pruned( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + SETUP_SUBPEL_SEARCH; + (void)use_upsampled_ref; + + besterr = setup_center_error( + xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, + y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX) { + unsigned int left, right, up, down, diag; + whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) + + (cost_list[2] < cost_list[4] ? 0 : 2); + switch (whichdir) { + case 0: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + } + } else { + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + } + + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + + if (allow_hp && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +/* clang-format off */ +static const MV search_step_table[12] = { + // left, right, up, down + { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 }, + { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 }, + { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 } +}; +/* clang-format on */ + +static int upsampled_pref_error(const MACROBLOCKD *xd, + const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, + const uint8_t *const y, int y_stride, + const uint8_t *second_pred, int w, int h, + unsigned int *sse) { + unsigned int besterr; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + if (second_pred != NULL) + aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, + y_stride); + else + aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + + besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + (void)xd; +#endif // CONFIG_HIGHBITDEPTH + if (second_pred != NULL) + aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride); + else + aom_upsampled_pred(pred, w, h, y, y_stride); + + besterr = vfp->vf(pred, w, src, src_stride, sse); +#if CONFIG_HIGHBITDEPTH + } +#endif + return besterr; +} + +static unsigned int upsampled_setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, const uint8_t *const y, + int y_stride, const uint8_t *second_pred, int w, int h, int offset, + int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error( + xd, vfp, src, src_stride, y + offset, y_stride, second_pred, w, h, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} + +int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp, + int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + const uint8_t *const src_address = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const MACROBLOCKD *xd = &x->e_mbd; + unsigned int besterr = INT_MAX; + unsigned int sse; + unsigned int thismse; + const int y_stride = xd->plane[0].pre[0].stride; + MV *bestmv = &x->best_mv.as_mv; + const int offset = bestmv->row * y_stride + bestmv->col; + const uint8_t *const y = xd->plane[0].pre[0].buf; + + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + int iter, round = 3 - forced_stop; + int tr = br; + int tc = bc; + const MV *search_step = search_step_table; + int idx, best_idx = -1; + unsigned int cost_array[5]; + int kr, kc; + int minc, maxc, minr, maxr; + + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + ref_mv); + + if (!allow_hp) + if (round == 3) round = 2; + + bestmv->row *= 8; + bestmv->col *= 8; + + // use_upsampled_ref can be 0 or 1 + if (use_upsampled_ref) + besterr = upsampled_setup_center_error( + xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, + y_stride, second_pred, w, h, (offset * 8), mvjcost, mvcost, sse1, + distortion); + else + besterr = setup_center_error( + xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, + y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + + (void)cost_list; // to silence compiler warning + + for (iter = 0; iter < round; ++iter) { + // Check vertical and horizontal sub-pixel positions. + for (idx = 0; idx < 4; ++idx) { + tr = br + search_step[idx].row; + tc = bc + search_step[idx].col; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, + pre_address, y_stride, second_pred, w, + h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, + mvcost, error_per_bit); + + if (cost_array[idx] < besterr) { + best_idx = idx; + besterr = cost_array[idx]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + } + + // Check diagonal sub-pixel position + kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep); + kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep); + + tc = bc + kc; + tr = br + kr; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = + upsampled_pref_error(xd, vfp, src_address, src_stride, pre_address, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); + + if (cost_array[4] < besterr) { + best_idx = 4; + besterr = cost_array[4]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + + if (best_idx < 4 && best_idx >= 0) { + br += search_step[best_idx].row; + bc += search_step[best_idx].col; + } else if (best_idx == 4) { + br = tr; + bc = tc; + } + + if (iters_per_step > 1 && best_idx != -1) { + if (use_upsampled_ref) { + SECOND_LEVEL_CHECKS_BEST(1); + } else { + SECOND_LEVEL_CHECKS_BEST(0); + } + } + + search_step += 4; + hstep >>= 1; + best_idx = -1; + } + + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +#undef PRE +#undef CHECK_BETTER + +static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { + return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) && + (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max); +} + +#define CHECK_BETTER \ + { \ + if (thissad < bestsad) { \ + if (use_mvcost) \ + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \ + if (thissad < bestsad) { \ + bestsad = thissad; \ + best_site = i; \ + } \ + } \ + } + +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Calculate and return a sad+mvcost list around an integer best pel. +static INLINE void calc_int_cost_list(const MACROBLOCK *x, + const MV *const ref_mv, int sadpb, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *best_mv, int *cost_list) { + static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0]; + const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; + const int br = best_mv->row; + const int bc = best_mv->col; + int i; + unsigned int sse; + const MV this_mv = { br, bc }; + + cost_list[0] = + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + in_what->stride, &sse) + + mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + if (check_bounds(&x->mv_limits, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride, + get_buf_from_mv(in_what, &neighbor_mv), + in_what->stride, &sse) + + mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, + x->mvcost, x->errorperbit); + } + } else { + for (i = 0; i < 4; i++) { + const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (!is_mv_in(&x->mv_limits, &neighbor_mv)) + cost_list[i + 1] = INT_MAX; + else + cost_list[i + 1] = + fn_ptr->vf(what->buf, what->stride, + get_buf_from_mv(in_what, &neighbor_mv), in_what->stride, + &sse) + + mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, x->mvcost, + x->errorperbit); + } + } +} + +static INLINE void calc_int_sad_list(const MACROBLOCK *x, + const MV *const ref_mv, int sadpb, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *best_mv, int *cost_list, + const int use_mvcost, const int bestsad) { + static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0]; + const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int i; + const int br = best_mv->row; + const int bc = best_mv->col; + + if (cost_list[0] == INT_MAX) { + cost_list[0] = bestsad; + if (check_bounds(&x->mv_limits, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + cost_list[i + 1] = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + } + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) + cost_list[i + 1] = INT_MAX; + else + cost_list[i + 1] = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + } + } + } else { + if (use_mvcost) { + for (i = 0; i < 4; i++) { + const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; + if (cost_list[i + 1] != INT_MAX) { + cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + } + } + } + } +} + +// Generic pattern search function that searches over multiple scales. +// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +// +static int pattern_search( + MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { + const MACROBLOCKD *const xd = &x->e_mbd; + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int last_is_4 = num_candidates[0] == 4; + int br, bc; + int bestsad = INT_MAX; + int thissad; + int k = -1; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + int best_init_s = search_param_to_steps[search_param]; + // adjust ref_mv to make sure it is within MV range + clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + br = start_mv->row; + bc = start_mv->col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + + // Work out the start point for the search + bestsad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, start_mv), in_what->stride) + + mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit); + + // Search all possible scales upto the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(&x->mv_limits, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + const int last_s = (last_is_4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= last_s; s--) { + // No need to search all points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } + + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = bestsad; + if (!do_init_search || s != best_init_s) { + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + cost_list[i + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!is_mv_in(&x->mv_limits, &this_mv)) continue; + cost_list[i + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; + cost_list[((k + 2) % 4) + 1] = cost_list[0]; + cost_list[0] = bestsad; + + if (check_bounds(&x->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + cost_list[next_chkpts_indices[i] + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!is_mv_in(&x->mv_limits, &this_mv)) { + cost_list[next_chkpts_indices[i] + 1] = INT_MAX; + continue; + } + cost_list[next_chkpts_indices[i] + 1] = thissad = + vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), in_what->stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } + } + } + + // Returns the one-away integer pel cost/sad around the best as follows: + // cost_list[0]: cost/sad at the best integer pel + // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel + // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel + // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel + // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel + if (cost_list) { + const MV best_int_mv = { br, bc }; + if (last_is_4) { + calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list, + use_mvcost, bestsad); + } else { + calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, + cost_list); + } + } + x->best_mv.as_mv.row = br; + x->best_mv.as_mv.col = bc; + return bestsad; +} + +int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const aom_variance_fn_ptr_t *vfp, + int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, &unused) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} + +int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} + +int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, + { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 }, + { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 }, + { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 }, + { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search, + cost_list, vfp, use_mvcost, center_mv, + hex_num_candidates, hex_candidates); +} + +static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 }, + { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 }, + { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search, + cost_list, vfp, use_mvcost, center_mv, + bigdia_num_candidates, bigdia_candidates); +} + +static int square_search(MACROBLOCK *x, MV *start_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv) { + // All scales have 8 closest points in square shape + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 }, + { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 }, + { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 }, + { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search, + cost_list, vfp, use_mvcost, center_mv, + square_num_candidates, square_candidates); +} + +static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, + int do_init_search, // must be zero for fast_hex + int *cost_list, const aom_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv) { + return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, cost_list, vfp, use_mvcost, + center_mv); +} + +static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv) { + return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, cost_list, vfp, use_mvcost, + center_mv); +} + +#undef CHECK_BETTER + +// Exhuastive motion search around a given centre position with a given +// step size. +static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv, + int range, int step, int sad_per_bit, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + MV fcenter_mv = { center_mv->row, center_mv->col }; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *best_mv = fcenter_mv; + best_sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) + + mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit); + start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row); + start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col); + end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row); + end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. + if (step > 1) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c }; + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + x->second_best_mv.as_mv = *best_mv; + *best_mv = mv; + } + } + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + const unsigned int sad = + sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + x->second_best_mv.as_mv = *best_mv; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + x->second_best_mv.as_mv = *best_mv; + *best_mv = mv; + } + } + } + } + } + } + } + + return best_sad; +} + +int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg, + MV *ref_mv, MV *best_mv, int search_param, + int sad_per_bit, int *num00, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + int i, j, step; + + const MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; + + unsigned int bestsad = INT_MAX; + int best_site = 0; + int last_site = 0; + + int ref_row; + int ref_col; + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; + + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + ref_row = ref_mv->row; + ref_col = ref_mv->col; + *num00 = 0; + best_mv->row = ref_row; + best_mv->col = ref_col; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + best_address = in_what; + + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); + + i = 1; + + for (step = 0; step < tot_steps; step++) { + int all_in = 1, t; + + // All_in is true if every one of the points we are checking are within + // the bounds of the image. + all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min); + all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max); + all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min); + all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max); + + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity.. + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) + block_offset[t] = ss[i + t].offset + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + if (sad_array[t] < bestsad) { + const MV this_mv = { best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col }; + sad_array[t] += + mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (sad_array[t] < bestsad) { + bestsad = sad_array[t]; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss[i].offset + best_address; + unsigned int thissad = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + x->second_best_mv.as_mv = *best_mv; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + const MV this_mv = { best_mv->row + ss[best_site].mv.row, + best_mv->col + ss[best_site].mv.col }; + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss[best_site].offset + best_address; + unsigned int thissad = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + if (thissad < bestsad) { + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + } +#endif + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = 4 << bwl; // redundant variable, to be changed in the experiments. + for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + + return (center - (bw >> 1)); +} + +static const MV search_pos[4] = { + { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, +}; + +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]); + int idx; + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv; + unsigned int best_sad, tmp_sad, sad_arr[4]; + MV this_mv; + const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + +#if CONFIG_HIGHBITDEPTH + { + unsigned int this_sad; + tmp_mv->row = 0; + tmp_mv->col = 0; + this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return this_sad; + } +#endif + + // Set up prediction 1-D reference set + ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); + for (idx = 0; idx < search_width; idx += 16) { + aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + ref_buf += 16; + } + + ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; + for (idx = 0; idx < search_height; ++idx) { + vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; + ref_buf += ref_stride; + } + + // Set up src 1-D reference set + for (idx = 0; idx < bw; idx += 16) { + src_buf = x->plane[0].src.buf + idx; + aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + } + + src_buf = x->plane[0].src.buf; + for (idx = 0; idx < bh; ++idx) { + src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; + src_buf += src_stride; + } + + // Find the best match per 1-D search + tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]); + tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]); + + this_mv = *tmp_mv; + src_buf = x->plane[0].src.buf; + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + }; + + cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, sad_arr); + } + + for (idx = 0; idx < 4; ++idx) { + if (sad_arr[idx] < best_sad) { + best_sad = sad_arr[idx]; + tmp_mv->row = search_pos[idx].row + this_mv.row; + tmp_mv->col = search_pos[idx].col + this_mv.col; + } + } + + if (sad_arr[0] < sad_arr[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (sad_arr[1] < sad_arr[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + + tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + *tmp_mv = this_mv; + best_sad = tmp_sad; + } + + tmp_mv->row *= 8; + tmp_mv->col *= 8; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + return best_sad; +} + +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, int *cost_list, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param, sadpb, &n, fn_ptr, ref_mv); + if (bestsme < INT_MAX) + bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + x->best_mv.as_mv = temp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param + n, sadpb, &num00, fn_ptr, + ref_mv); + if (thissme < INT_MAX) + thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + x->best_mv.as_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = x->best_mv.as_mv; + thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, + ref_mv); + if (thissme < INT_MAX) + thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); + if (thissme < bestsme) { + bestsme = thissme; + x->best_mv.as_mv = best_mv; + } + } + + // Return cost list. + if (cost_list) { + calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list); + } + return bestsme; +} + +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +// Runs an limited range exhaustive mesh search using a pattern set +// according to the encode speed profile. +static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x, + const MV *centre_mv_full, int sadpb, + int *cost_list, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; + int bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + + // Keep track of number of exhaustive calls (this frame in this thread). + ++(*x->ex_search_count_ptr); + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) + return INT_MAX; + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = AOMMAX(range, (5 * AOMMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = AOMMIN(range, MAX_RANGE); + interval = AOMMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, + sadpb, fn_ptr, &temp_mv); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhuastive_mesh_search( + x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv); + + if (sf->mesh_patterns[i].interval == 1) break; + } + } + + if (bestsme < INT_MAX) + bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + *dst_mv = temp_mv; + + // Return cost list. + if (cost_list) { + calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); + } + return bestsme; +} + +int av1_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, + int sad_per_bit, int distance, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { + int r, c; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); + const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); + const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); + const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + int best_sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; + + for (r = row_min; r < row_max; ++r) { + for (c = col_min; c < col_max; ++c) { + const MV mv = { r, c }; + const int sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride) + + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + return best_sad; +} + +int av1_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, + int sad_per_bit, int distance, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { + int r; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); + const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); + const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); + const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + unsigned int best_sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; + + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; + + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + DECLARE_ALIGNED(16, uint32_t, sads[3]); + + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); + + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = { r, c }; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + ++check_here; + ++c; + } + } + } + + while (c < col_max) { + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = { r, c }; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + ++check_here; + ++c; + } + } + + return best_sad; +} + +int av1_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, + int sad_per_bit, int distance, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { + int r; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = AOMMAX(ref_mv->row - distance, x->mv_limits.row_min); + const int row_max = AOMMIN(ref_mv->row + distance, x->mv_limits.row_max); + const int col_min = AOMMAX(ref_mv->col - distance, x->mv_limits.col_min); + const int col_max = AOMMIN(ref_mv->col + distance, x->mv_limits.col_max); + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + unsigned int best_sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; + + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; + + if (fn_ptr->sdx8f != NULL) { + while ((c + 7) < col_max) { + int i; + DECLARE_ALIGNED(16, uint32_t, sads[8]); + + fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, + sads); + + for (i = 0; i < 8; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = { r, c }; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + ++check_here; + ++c; + } + } + } + + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + DECLARE_ALIGNED(16, uint32_t, sads[3]); + + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); + + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = { r, c }; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + ++check_here; + ++c; + } + } + } + + while (c < col_max) { + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = { r, c }; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + ++check_here; + ++c; + } + } + + return best_sad; +} + +int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit, + int search_range, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); + unsigned int best_sad = + fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); + int i, j; + + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) & + ((ref_mv->row + 1) < x->mv_limits.row_max) & + ((ref_mv->col - 1) > x->mv_limits.col_min) & + ((ref_mv->col + 1) < x->mv_limits.col_max); + + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; + + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + + for (j = 0; j < 4; ++j) { + if (sads[j] < best_sad) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sads[j] < best_sad) { + best_sad = sads[j]; + best_site = j; + } + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + } + + if (best_site == -1) { + break; + } else { + x->second_best_mv.as_mv = *ref_mv; + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, ref_mv); + } + } + + return best_sad; +} + +// This function is called when we do joint motion search in comp_inter_inter +// mode. +int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, const uint8_t *second_pred) { + const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + MV *best_mv = &x->best_mv.as_mv; + unsigned int best_sad = INT_MAX; + int i, j; + + clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = + fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + + for (i = 0; i < search_range; ++i) { + int best_site = -1; + + for (j = 0; j < 8; ++j) { + const MV mv = { best_mv->row + neighbors[j].row, + best_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = + fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride, second_pred); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].row; + best_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +#define MIN_EX_SEARCH_LIMIT 128 +static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { + const SPEED_FEATURES *const sf = &cpi->sf; + const int max_ex = + AOMMAX(MIN_EX_SEARCH_LIMIT, + (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); + + return sf->allow_exhaustive_searches && + (sf->exhaustive_searches_thresh < INT_MAX) && + (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; +} + +int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int error_per_bit, + int *cost_list, const MV *ref_mv, int var_max, + int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = sf->mv.search_method; + const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + int var = 0; + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + // Keep track of number of searches (this frame in this thread). + ++(*x->m_search_count_ptr); + + switch (method) { + case FAST_DIAMOND: + var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + cost_list, fn_ptr, 1, ref_mv); + break; + case FAST_HEX: + var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + cost_list, fn_ptr, 1, ref_mv); + break; + case HEX: + var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv); + break; + case SQUARE: + var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv); + break; + case BIGDIA: + var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, + fn_ptr, 1, ref_mv); + break; + case NSTEP: + var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cost_list, fn_ptr, ref_mv); + + // Should we allow a follow on exhaustive search? + if (is_exhaustive_allowed(cpi, x)) { + int exhuastive_thr = sf->exhaustive_searches_thresh; + exhuastive_thr >>= + 10 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + + // Threshold variance for an exhaustive full search. + if (var > exhuastive_thr) { + int var_ex; + MV tmp_mv_ex; + var_ex = + full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit, + cost_list, fn_ptr, ref_mv, &tmp_mv_ex); + + if (var_ex < var) { + var = var_ex; + x->best_mv.as_mv = tmp_mv_ex; + } + } + } + break; + + break; + default: assert(0 && "Invalid search method."); + } + + if (method != NSTEP && rd && var < var_max) + var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1); + + return var; +} + +#if CONFIG_EXT_INTER +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \ + mask, mask_stride, &sse) + +/* checks if (r, c) has better score than previous best */ + +#define MVC(r, c) \ + (mvcost \ + ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \ + mvcost[1][((c)-rc)]) * \ + error_per_bit + \ + 4096) >> \ + 13 \ + : 0) + +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = (DIST(r, c)); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#undef CHECK_BETTER0 +#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) + +#undef CHECK_BETTER1 +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \ + src_stride, upre(y, y_stride, r, c), \ + y_stride, w, h, &sse); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +int av1_find_best_masked_sub_pixel_tree( + const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv, + const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second) { + const uint8_t *const z = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const MACROBLOCKD *xd = &x->e_mbd; + unsigned int besterr = INT_MAX; + unsigned int sse; + int thismse; + unsigned int whichdir; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; + + const int y_stride = xd->plane[0].pre[is_second].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + const uint8_t *const y = xd->plane[0].pre[is_second].buf; + + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + int tr = br; + int tc = bc; + int minc, maxc, minr, maxr; + + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + ref_mv); + + // central mv + bestmv->row *= 8; + bestmv->col *= 8; + + // calculate central point error + besterr = + vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + + // 1/2 pel + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + + if (allow_hp && forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; + } + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + return besterr; +} + +static unsigned int setup_masked_center_error( + const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, const uint8_t *const y, + int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, + int *distortion) { + unsigned int besterr; + besterr = + vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} + +static int upsampled_masked_pref_error(const MACROBLOCKD *xd, + const uint8_t *mask, int mask_stride, + const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, + const int src_stride, + const uint8_t *const y, int y_stride, + int w, int h, unsigned int *sse) { + unsigned int besterr; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + + besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask, + mask_stride, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + (void)xd; +#endif // CONFIG_HIGHBITDEPTH + aom_upsampled_pred(pred, w, h, y, y_stride); + + besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse); +#if CONFIG_HIGHBITDEPTH + } +#endif + return besterr; +} + +static unsigned int upsampled_setup_masked_center_error( + const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride, + const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, + const int src_stride, const uint8_t *const y, int y_stride, int w, int h, + int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, + int *distortion) { + unsigned int besterr = + upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride, + y + offset, y_stride, w, h, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} + +int av1_find_best_masked_sub_pixel_tree_up( + const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride, + int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, int is_second, int use_upsampled_ref) { + const uint8_t *const z = x->plane[0].src.buf; + const uint8_t *const src_address = z; + const int src_stride = x->plane[0].src.stride; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + unsigned int besterr = INT_MAX; + unsigned int sse; + unsigned int thismse; + + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + int iter; + int round = 3 - forced_stop; + int tr = br; + int tc = bc; + const MV *search_step = search_step_table; + int idx, best_idx = -1; + unsigned int cost_array[5]; + int kr, kc; + const int w = block_size_wide[mbmi->sb_type]; + const int h = block_size_high[mbmi->sb_type]; + int offset; + int y_stride; + const uint8_t *y; + + const struct buf_2d backup_pred = pd->pre[is_second]; + int minc, maxc, minr, maxr; + + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + ref_mv); + + if (use_upsampled_ref) { + int ref = xd->mi[0]->mbmi.ref_frame[is_second]; + const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); + setup_pred_plane(&pd->pre[is_second], mbmi->sb_type, + upsampled_ref->y_buffer, upsampled_ref->y_crop_width, + upsampled_ref->y_crop_height, upsampled_ref->y_stride, + (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, + pd->subsampling_y); + } + y = pd->pre[is_second].buf; + y_stride = pd->pre[is_second].stride; + offset = bestmv->row * y_stride + bestmv->col; + + if (!allow_hp) + if (round == 3) round = 2; + + bestmv->row *= 8; + bestmv->col *= 8; + + // use_upsampled_ref can be 0 or 1 + if (use_upsampled_ref) + besterr = upsampled_setup_masked_center_error( + xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, + src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1, + distortion); + else + besterr = setup_masked_center_error( + mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, + y_stride, offset, mvjcost, mvcost, sse1, distortion); + + for (iter = 0; iter < round; ++iter) { + // Check vertical and horizontal sub-pixel positions. + for (idx = 0; idx < 4; ++idx) { + tr = br + search_step[idx].row; + tc = bc + search_step[idx].col; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_masked_pref_error( + xd, mask, mask_stride, vfp, src_address, src_stride, pre_address, + y_stride, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, mask, mask_stride, &sse); + } + + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, + mvcost, error_per_bit); + + if (cost_array[idx] < besterr) { + best_idx = idx; + besterr = cost_array[idx]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + } + + // Check diagonal sub-pixel position + kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep); + kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep); + + tc = bc + kc; + tr = br + kr; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_masked_pref_error( + xd, mask, mask_stride, vfp, src_address, src_stride, pre_address, + y_stride, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + + thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, mask, mask_stride, &sse); + } + + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); + + if (cost_array[4] < besterr) { + best_idx = 4; + besterr = cost_array[4]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + + if (best_idx < 4 && best_idx >= 0) { + br += search_step[best_idx].row; + bc += search_step[best_idx].col; + } else if (best_idx == 4) { + br = tr; + bc = tc; + } + + if (iters_per_step > 1 && best_idx != -1) { + if (use_upsampled_ref) { + SECOND_LEVEL_CHECKS_BEST(1); + } else { + SECOND_LEVEL_CHECKS_BEST(0); + } + } + + tr = br; + tc = bc; + + search_step += 4; + hstep >>= 1; + best_idx = -1; + } + + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + if (use_upsampled_ref) { + pd->pre[is_second] = backup_pred; + } + + return besterr; +} + +#undef DIST +#undef MVC +#undef CHECK_BETTER + +static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask, + int mask_stride, const MV *best_mv, + const MV *center_mv, + const aom_variance_fn_ptr_t *vfp, + int use_mvcost, int is_second) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, mask, mask_stride, &unused) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} + +int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask, + int mask_stride, MV *ref_mv, int error_per_bit, + int search_range, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int is_second) { + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + unsigned int best_sad = + fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), + in_what->stride, mask, mask_stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); + int i, j; + + for (i = 0; i < search_range; i++) { + int best_site = -1; + + for (j = 0; j < 4; j++) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = + fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride, mask, mask_stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +int masked_diamond_search_sad(const MACROBLOCK *x, + const search_site_config *cfg, + const uint8_t *mask, int mask_stride, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int is_second) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + // search_param determines the length of the initial step and hence the number + // of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = + // (MAX_FIRST_STEP/4) pel... etc. + const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step]; + const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + const uint8_t *best_address, *in_what_ref; + int best_sad = INT_MAX; + int best_site = 0; + int last_site = 0; + int i, j, step; + + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + in_what_ref = get_buf_from_mv(in_what, ref_mv); + best_address = in_what_ref; + *num00 = 0; + *best_mv = *ref_mv; + + // Check the starting position + best_sad = fn_ptr->msdf(what->buf, what->stride, best_address, + in_what->stride, mask, mask_stride) + + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); + + i = 1; + + for (step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j++) { + const MV mv = { best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col }; + if (is_mv_in(&x->mv_limits, &mv)) { + int sad = + fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset, + in_what->stride, mask, mask_stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = i; + } + } + } + + i++; + } + + if (best_site != last_site) { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + const MV this_mv = { best_mv->row + ss[best_site].mv.row, + best_mv->col + ss[best_site].mv.col }; + if (is_mv_in(&x->mv_limits, &this_mv)) { + int sad = fn_ptr->msdf(what->buf, what->stride, + best_address + ss[best_site].offset, + in_what->stride, mask, mask_stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + } +#endif + } else if (best_address == in_what_ref) { + (*num00)++; + } + } + return best_sad; +} + +int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, + const uint8_t *mask, int mask_stride, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride, + mvp_full, &temp_mv, step_param, sadpb, + &n, fn_ptr, ref_mv, is_second); + if (bestsme < INT_MAX) + bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv, + fn_ptr, 1, is_second); + *dst_mv = temp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = masked_diamond_search_sad( + x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv, + step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv, + fn_ptr, 1, is_second); + + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; + thissme = + masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb, + search_range, fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv, + fn_ptr, 1, is_second); + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = best_mv; + } + } + return bestsme; +} +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse) + +/* checks if (r, c) has better score than previous best */ +#define MVC(r, c) \ + (mvcost \ + ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \ + mvcost[1][((c)-rc)]) * \ + error_per_bit + \ + 4096) >> \ + 13 \ + : 0) + +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = (DIST(r, c)); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#undef CHECK_BETTER0 +#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) + +#undef CHECK_BETTER1 +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = upsampled_obmc_pref_error( \ + xd, mask, vfp, z, upre(y, y_stride, r, c), y_stride, w, h, &sse); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +static unsigned int setup_obmc_center_error( + const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, + const uint8_t *const y, int y_stride, int offset, int *mvjcost, + int *mvcost[2], unsigned int *sse1, int *distortion) { + unsigned int besterr; + besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} + +static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask, + const aom_variance_fn_ptr_t *vfp, + const int32_t *const wsrc, + const uint8_t *const y, int y_stride, + int w, int h, unsigned int *sse) { + unsigned int besterr; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + + besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + (void)xd; +#endif // CONFIG_HIGHBITDEPTH + aom_upsampled_pred(pred, w, h, y, y_stride); + + besterr = vfp->ovf(pred, w, wsrc, mask, sse); +#if CONFIG_HIGHBITDEPTH + } +#endif + return besterr; +} + +static unsigned int upsampled_setup_obmc_center_error( + const MACROBLOCKD *xd, const int32_t *mask, const MV *bestmv, + const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, + const int32_t *const wsrc, const uint8_t *const y, int y_stride, int w, + int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, + int *distortion) { + unsigned int besterr = upsampled_obmc_pref_error( + xd, mask, vfp, wsrc, y + offset, y_stride, w, h, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + return besterr; +} + +int av1_find_best_obmc_sub_pixel_tree_up( + const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, MV *bestmv, + const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_upsampled_ref) { + const int32_t *wsrc = x->wsrc_buf; + const int32_t *mask = x->mask_buf; + const int *const z = wsrc; + const int *const src_address = z; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + unsigned int besterr = INT_MAX; + unsigned int sse; + unsigned int thismse; + + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + int iter; + int round = 3 - forced_stop; + int tr = br; + int tc = bc; + const MV *search_step = search_step_table; + int idx, best_idx = -1; + unsigned int cost_array[5]; + int kr, kc; + const int w = block_size_wide[mbmi->sb_type]; + const int h = block_size_high[mbmi->sb_type]; + int offset; + int y_stride; + const uint8_t *y; + + const struct buf_2d backup_pred = pd->pre[is_second]; + int minc, maxc, minr, maxr; + + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + ref_mv); + + if (use_upsampled_ref) { + int ref = xd->mi[0]->mbmi.ref_frame[is_second]; + const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); + setup_pred_plane(&pd->pre[is_second], mbmi->sb_type, + upsampled_ref->y_buffer, upsampled_ref->y_crop_width, + upsampled_ref->y_crop_height, upsampled_ref->y_stride, + (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, + pd->subsampling_y); + } + y = pd->pre[is_second].buf; + y_stride = pd->pre[is_second].stride; + offset = bestmv->row * y_stride + bestmv->col; + + if (!allow_hp) + if (round == 3) round = 2; + + bestmv->row *= 8; + bestmv->col *= 8; + // use_upsampled_ref can be 0 or 1 + if (use_upsampled_ref) + besterr = upsampled_setup_obmc_center_error( + xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h, + (offset * 8), mvjcost, mvcost, sse1, distortion); + else + besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, + z, y, y_stride, offset, mvjcost, mvcost, + sse1, distortion); + + for (iter = 0; iter < round; ++iter) { + // Check vertical and horizontal sub-pixel positions. + for (idx = 0; idx < 4; ++idx) { + tr = br + search_step[idx].row; + tc = bc + search_step[idx].col; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_obmc_pref_error( + xd, mask, vfp, src_address, pre_address, y_stride, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), + src_address, mask, &sse); + } + + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, + mvcost, error_per_bit); + if (cost_array[idx] < besterr) { + best_idx = idx; + besterr = cost_array[idx]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + } + + // Check diagonal sub-pixel position + kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep); + kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep); + + tc = bc + kc; + tr = br + kr; + if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { + MV this_mv = { tr, tc }; + + if (use_upsampled_ref) { + const uint8_t *const pre_address = y + tr * y_stride + tc; + + thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address, + pre_address, y_stride, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + + thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address, + mask, &sse); + } + + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); + + if (cost_array[4] < besterr) { + best_idx = 4; + besterr = cost_array[4]; + *distortion = thismse; + *sse1 = sse; + } + } else { + cost_array[idx] = INT_MAX; + } + + if (best_idx < 4 && best_idx >= 0) { + br += search_step[best_idx].row; + bc += search_step[best_idx].col; + } else if (best_idx == 4) { + br = tr; + bc = tc; + } + + if (iters_per_step > 1 && best_idx != -1) { + if (use_upsampled_ref) { + SECOND_LEVEL_CHECKS_BEST(1); + } else { + SECOND_LEVEL_CHECKS_BEST(0); + } + } + + tr = br; + tc = bc; + + search_step += 4; + hstep >>= 1; + best_idx = -1; + } + + // These lines insure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void)tr; + (void)tc; + + bestmv->row = br; + bestmv->col = bc; + + if (use_upsampled_ref) { + pd->pre[is_second] = backup_pred; + } + + return besterr; +} + +#undef DIST +#undef MVC +#undef CHECK_BETTER + +static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc, + const int32_t *mask, const MV *best_mv, + const MV *center_mv, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + int is_second) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride, wsrc, + mask, &unused) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} + +int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc, + const int32_t *mask, MV *ref_mv, int error_per_bit, + int search_range, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int is_second) { + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv), + in_what->stride, wsrc, mask) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); + int i, j; + + for (i = 0; i < search_range; i++) { + int best_site = -1; + + for (j = 0; j < 4; j++) { + const MV mv = { ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col }; + if (is_mv_in(&x->mv_limits, &mv)) { + unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv), + in_what->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg, + const int32_t *wsrc, const int32_t *mask, + MV *ref_mv, MV *best_mv, int search_param, + int sad_per_bit, int *num00, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int is_second) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; + // search_param determines the length of the initial step and hence the number + // of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = + // (MAX_FIRST_STEP/4) pel... etc. + const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step]; + const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; + const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; + const uint8_t *best_address, *in_what_ref; + int best_sad = INT_MAX; + int best_site = 0; + int last_site = 0; + int i, j, step; + + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col; + best_address = in_what_ref; + *num00 = 0; + *best_mv = *ref_mv; + + // Check the starting position + best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) + + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); + + i = 1; + + for (step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j++) { + const MV mv = { best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col }; + if (is_mv_in(&x->mv_limits, &mv)) { + int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride, + wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_site = i; + } + } + } + + i++; + } + + if (best_site != last_site) { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + const MV this_mv = { best_mv->row + ss[best_site].mv.row, + best_mv->col + ss[best_site].mv.col }; + if (is_mv_in(&x->mv_limits, &this_mv)) { + int sad = fn_ptr->osdf(best_address + ss[best_site].offset, + in_what->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + } +#endif + } else if (best_address == in_what_ref) { + (*num00)++; + } + } + return best_sad; +} + +int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second) { + const int32_t *wsrc = x->wsrc_buf; + const int32_t *mask = x->mask_buf; + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = + obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv, + step_param, sadpb, &n, fn_ptr, ref_mv, is_second); + if (bestsme < INT_MAX) + bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1, + is_second); + *dst_mv = temp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, + &temp_mv, step_param + n, sadpb, &num00, + fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, + 1, is_second); + + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; + thissme = obmc_refining_search_sad(x, wsrc, mask, &best_mv, sadpb, + search_range, fn_ptr, ref_mv, is_second); + if (thissme < INT_MAX) + thissme = get_obmc_mvpred_var(x, wsrc, mask, &best_mv, ref_mv, fn_ptr, 1, + is_second); + if (thissme < bestsme) { + bestsme = thissme; + *dst_mv = best_mv; + } + } + return bestsme; +} +#endif // CONFIG_MOTION_VAR + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)src_address; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)use_upsampled_ref; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)forced_stop; \ + (void)hstep; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ + (void)cost_list; +// Return the maximum MV. +int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp, + int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + COMMON_MV_TEST; + (void)minr; + (void)minc; + bestmv->row = maxr; + bestmv->col = maxc; + besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp); + return besterr; +} +// Return the minimum MV. +int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp, + int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, int w, int h, + int use_upsampled_ref) { + COMMON_MV_TEST; + (void)maxr; + (void)maxc; + bestmv->row = minr; + bestmv->col = minc; + besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp); + return besterr; +} diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h new file mode 100644 index 000000000..8465860ad --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_MCOMP_H_ +#define AV1_ENCODER_MCOMP_H_ + +#include "av1/encoder/block.h" +#include "aom_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) +// Allowed motion vector pixel distance outside image border +// for Block_16x16 +#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND) + +// motion search site +typedef struct search_site { + MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site ss[8 * MAX_MVSEARCH_STEPS + 1]; + int ss_count; + int searches_per_step; +} search_site_config; + +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride); +void av1_init3smotion_compensation(search_site_config *cfg, int stride); + +void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv); + +int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, + int *mvcost[2], int weight); + +// Utility to compute variance + MV rate cost for a given MV +int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const aom_variance_fn_ptr_t *vfp, + int use_mvcost); +int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, int use_mvcost); + +struct AV1_COMP; +struct SPEED_FEATURES; + +int av1_init_search_range(int size); + +int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv, + int sad_per_bit, int distance, + const aom_variance_fn_ptr_t *fn_ptr, + const struct mv *center_mv); + +// Runs sequence of diamond searches in smaller steps for RD. +int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, int *cost_list, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv); + +// Perform integral projection based motion estimation. +unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col); + +int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param, + int sad_per_bit, int do_init_search, int *cost_list, + const aom_variance_fn_ptr_t *vfp, int use_mvcost, + const MV *center_mv); + +typedef int(fractional_mv_step_fp)( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, + int h, int use_upsampled_ref); + +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore; +extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; +extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; + +typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, + int sad_per_bit, int distance, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv); + +typedef int (*av1_diamond_search_fn_t)( + MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + +int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, const uint8_t *second_pred); + +struct AV1_COMP; + +int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int error_per_bit, int *cost_list, const MV *ref_mv, + int var_max, int rd); + +#if CONFIG_EXT_INTER +int av1_find_best_masked_sub_pixel_tree( + const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv, + const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second); +int av1_find_best_masked_sub_pixel_tree_up( + const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, + int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, + int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref); +int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, + const uint8_t *mask, int mask_stride, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second); +#endif // CONFIG_EXT_INTER + +#if CONFIG_MOTION_VAR +int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + const aom_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv, int is_second); +int av1_find_best_obmc_sub_pixel_tree_up( + const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, + MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, + int is_second, int use_upsampled_ref); +#endif // CONFIG_MOTION_VAR +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c new file mode 100644 index 000000000..8d13af7ad --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ + static int64_t block_error_##BSize##size_msa( \ + const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \ + sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ + } + +/* clang-format off */ +BLOCK_ERROR_BLOCKSIZE_MSA(16) +BLOCK_ERROR_BLOCKSIZE_MSA(64) +BLOCK_ERROR_BLOCKSIZE_MSA(256) +BLOCK_ERROR_BLOCKSIZE_MSA(1024) +/* clang-format on */ + +int64_t av1_block_error_msa(const tran_low_t *coeff_ptr, + const tran_low_t *dq_coeff_ptr, intptr_t blk_size, + int64_t *ssz) { + int64_t err; + const int16_t *coeff = (const int16_t *)coeff_ptr; + const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr; + + switch (blk_size) { + case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break; + case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break; + case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break; + case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break; + default: + err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz); + break; + } + + return err; +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c new file mode 100644 index 000000000..4b0364d6c --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct16x16_msa.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" +#include "aom_dsp/mips/fwd_txfm_msa.h" + +static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride, + const int32_t *const0, int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r15 = LD_SH(input + 15 * stride); + r7 = LD_SH(input + 7 * stride); + r8 = LD_SH(input + 8 * stride); + SLLI_4V(r0, r15, r7, r8, 2); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 8, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * stride); + r4 = LD_SH(input + 4 * stride); + r11 = LD_SH(input + 11 * stride); + r12 = LD_SH(input + 12 * stride); + SLLI_4V(r3, r4, r11, r12, 2); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp2, int_buf, 8); + ST_SH2(tp1, tp3, int_buf + 4 * 8, 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + ST_SH2(h0, h1, int_buf + 8 * 8, 8); + ST_SH2(h3, h2, int_buf + 12 * 8, 8); + + r9 = LD_SH(input + 9 * stride); + r6 = LD_SH(input + 6 * stride); + r1 = LD_SH(input + stride); + r14 = LD_SH(input + 14 * stride); + SLLI_4V(r9, r6, r1, r14, 2); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r13 = LD_SH(input + 13 * stride); + r2 = LD_SH(input + 2 * stride); + r5 = LD_SH(input + 5 * stride); + r10 = LD_SH(input + 10 * stride); + SLLI_4V(r13, r2, r5, r10, 2); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_step2_msa_helper(int16_t *int_buf, const int32_t *const0, + int16_t *out, int16_t *out_ptr) { + v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v4i32 k0, k1, k2, k3; + + LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15); + LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7); + LD_SW2(const0 + 4 * 19, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 21); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + + tp0 = LD_SH(int_buf + 4 * 8); + tp1 = LD_SH(int_buf + 5 * 8); + tp3 = LD_SH(int_buf + 10 * 8); + tp2 = LD_SH(int_buf + 14 * 8); + LD_SW2(const0 + 4 * 22, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 24); + MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + ST_SH(out4, (out + 3 * 16)); + ST_SH(out5, (out_ptr + 4 * 16)); + + h1 = LD_SH(int_buf + 9 * 8); + h3 = LD_SH(int_buf + 12 * 8); + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + ST_SH(out12, (out + 2 * 16)); + ST_SH(out13, (out_ptr + 5 * 16)); + + tp0 = LD_SH(int_buf); + tp1 = LD_SH(int_buf + 8); + tp2 = LD_SH(int_buf + 2 * 8); + tp3 = LD_SH(int_buf + 6 * 8); + + BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10); + out1 = -out1; + ST_SH(out0, (out)); + ST_SH(out1, (out_ptr + 7 * 16)); + + h0 = LD_SH(int_buf + 8 * 8); + h2 = LD_SH(int_buf + 13 * 8); + + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + ST_SH(out8, (out + 16)); + ST_SH(out9, (out_ptr + 6 * 16)); + + /* stage 4 */ + LD_SW2(const0 + 4 * 25, 4, k0, k1); + LD_SW2(const0 + 4 * 27, 4, k2, k3); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + ST_SH(out2, (out + 7 * 16)); + ST_SH(out3, (out_ptr)); + + MADD_SHORT(out6, out7, k0, k3, out6, out7); + ST_SH(out6, (out + 4 * 16)); + ST_SH(out7, (out_ptr + 3 * 16)); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + ST_SH(out10, (out + 6 * 16)); + ST_SH(out11, (out_ptr + 16)); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + ST_SH(out14, (out + 5 * 16)); + ST_SH(out15, (out_ptr + 2 * 16)); +} + +static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + fadst16_step2_msa_helper(int_buf, const0, out, out + 128); +} + +static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); + out += 64; + + /* load input data */ + input += 128; + LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + FDCT_POSTPROC_2V_NEG_H(r0, r1); + FDCT_POSTPROC_2V_NEG_H(r2, r3); + FDCT_POSTPROC_2V_NEG_H(r4, r5); + FDCT_POSTPROC_2V_NEG_H(r6, r7); + ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8); + out += 64; + + LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + FDCT_POSTPROC_2V_NEG_H(r8, r9); + FDCT_POSTPROC_2V_NEG_H(r10, r11); + FDCT_POSTPROC_2V_NEG_H(r12, r13); + FDCT_POSTPROC_2V_NEG_H(r14, r15); + ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8); +} + +static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0, + int16_t *int_buf) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3; + v4i32 k0, k1, k2, k3; + + /* load input data */ + r0 = LD_SH(input); + r7 = LD_SH(input + 7 * 8); + r8 = LD_SH(input + 8 * 8); + r15 = LD_SH(input + 15 * 8); + + /* stage 1 */ + LD_SW2(const0, 4, k0, k1); + LD_SW2(const0 + 4 * 2, 4, k2, k3); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + + r3 = LD_SH(input + 3 * 8); + r4 = LD_SH(input + 4 * 8); + r11 = LD_SH(input + 11 * 8); + r12 = LD_SH(input + 12 * 8); + + LD_SW2(const0 + 4 * 4, 4, k0, k1); + LD_SW2(const0 + 4 * 6, 4, k2, k3); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + + /* stage 2 */ + BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1); + ST_SH2(tp0, tp1, int_buf, 4 * 8); + ST_SH2(tp2, tp3, int_buf + 8, 4 * 8); + + LD_SW2(const0 + 4 * 8, 4, k0, k1); + k2 = LD_SW(const0 + 4 * 10); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8); + ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8); + + r1 = LD_SH(input + 8); + r6 = LD_SH(input + 6 * 8); + r9 = LD_SH(input + 9 * 8); + r14 = LD_SH(input + 14 * 8); + + LD_SW2(const0 + 4 * 11, 4, k0, k1); + LD_SW2(const0 + 4 * 13, 4, k2, k3); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3); + ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8); + + r2 = LD_SH(input + 2 * 8); + r5 = LD_SH(input + 5 * 8); + r10 = LD_SH(input + 10 * 8); + r13 = LD_SH(input + 13 * 8); + + LD_SW2(const0 + 4 * 15, 4, k0, k1); + LD_SW2(const0 + 4 * 17, 4, k2, k3); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3); + ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8); + BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3); + ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8); +} + +static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0, + int16_t *out) { + fadst16_step2_msa_helper(int_buf, const0, out, out + 8); +} + +static void fadst16_transpose_msa(int16_t *input, int16_t *out) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); + out += 16 * 8; + + /* load input data */ + input += 128; + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6, + r7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11, + r12, r13, r14, r15); + ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8); + ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8); +} + +static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { + int16_t *temp = intermediate; + int16_t *out = output; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; + v8i16 in12, in13, in14, in15; + + LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); + temp = intermediate + 8; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(in0, in1); + FDCT_POSTPROC_2V_NEG_H(in2, in3); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + FDCT_POSTPROC_2V_NEG_H(in6, in7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + temp = intermediate; + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + temp = intermediate; + LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + out = output + 8; + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); +} + +void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + DECLARE_ALIGNED(32, int16_t, tmp[256]); + DECLARE_ALIGNED(32, int16_t, trans_buf[256]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[128]); + int32_t i; + int16_t *ptmpbuf = &tmp_buf[0]; + int16_t *trans = &trans_buf[0]; + const int32_t const_arr[29 * 4] = { + 52707308, 52707308, 52707308, 52707308, -1072430300, + -1072430300, -1072430300, -1072430300, 795618043, 795618043, + 795618043, 795618043, -721080468, -721080468, -721080468, + -721080468, 459094491, 459094491, 459094491, 459094491, + -970646691, -970646691, -970646691, -970646691, 1010963856, + 1010963856, 1010963856, 1010963856, -361743294, -361743294, + -361743294, -361743294, 209469125, 209469125, 209469125, + 209469125, -1053094788, -1053094788, -1053094788, -1053094788, + 1053160324, 1053160324, 1053160324, 1053160324, 639644520, + 639644520, 639644520, 639644520, -862444000, -862444000, + -862444000, -862444000, 1062144356, 1062144356, 1062144356, + 1062144356, -157532337, -157532337, -157532337, -157532337, + 260914709, 260914709, 260914709, 260914709, -1041559667, + -1041559667, -1041559667, -1041559667, 920985831, 920985831, + 920985831, 920985831, -551995675, -551995675, -551995675, + -551995675, 596522295, 596522295, 596522295, 596522295, + 892853362, 892853362, 892853362, 892853362, -892787826, + -892787826, -892787826, -892787826, 410925857, 410925857, + 410925857, 410925857, -992012162, -992012162, -992012162, + -992012162, 992077698, 992077698, 992077698, 992077698, + 759246145, 759246145, 759246145, 759246145, -759180609, + -759180609, -759180609, -759180609, -759222975, -759222975, + -759222975, -759222975, 759288511, 759288511, 759288511, + 759288511 + }; + + switch (tx_type) { + case DCT_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case ADST_DCT: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i)); + } + break; + case DCT_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + case ADST_ADST: + /* column transform */ + for (i = 0; i < 2; ++i) { + fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf); + fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3)); + } + + fadst16_transpose_postproc_msa(tmp, trans); + + /* row transform */ + for (i = 0; i < 2; ++i) { + fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf); + fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7)); + } + + fadst16_transpose_msa(tmp, output); + break; + default: assert(0); break; + } +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c new file mode 100644 index 000000000..da1ac74f0 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" + +void av1_fwht4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + in0 += in1; + in3 -= in2; + in4 = (in0 - in3) >> 1; + SUB2(in4, in1, in4, in2, in1, in2); + in0 -= in2; + in3 += in1; + + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + + in0 += in2; + in1 -= in3; + in4 = (in0 - in1) >> 1; + SUB2(in4, in2, in4, in3, in2, in3); + in0 -= in3; + in1 += in2; + + SLLI_4V(in0, in1, in2, in3, 2); + + TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); + + ST4x2_UB(in0, output, 4); + ST4x2_UB(in3, output + 4, 4); + ST4x2_UB(in1, output + 8, 4); + ST4x2_UB(in2, output + 12, 4); +} + +void av1_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 temp, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + temp = __msa_ceqi_h(in0, 0); + temp = (v8i16)__msa_xori_b((v16u8)temp, 255); + temp = mask & temp; + in0 += temp; + } + + switch (tx_type) { + case DCT_DCT: + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_DCT: + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case DCT_ADST: + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + case ADST_ADST: + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); + break; + default: assert(0); break; + } + + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c new file mode 100644 index 000000000..4cbf60a11 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct8x8_msa.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/enums.h" +#include "av1/encoder/mips/msa/fdct_msa.h" + +void av1_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride, + int32_t tx_type) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + + switch (tx_type) { + case DCT_DCT: + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_DCT: + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case DCT_ADST: + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + case ADST_ADST: + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, + in3, in4, in5, in6, in7); + AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + break; + default: assert(0); break; + } + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} diff --git a/third_party/aom/av1/encoder/mips/msa/fdct_msa.h b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h new file mode 100644 index 000000000..52bcf790c --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/fdct_msa.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ +#define AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ + +#include "aom_dsp/mips/fwd_txfm_msa.h" +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_ports/mem.h" + +#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define AOM_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \ + v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \ + \ + UNPCK_R_SH_SW(in0, in0_r_m); \ + UNPCK_R_SH_SW(in1, in1_r_m); \ + UNPCK_R_SH_SW(in2, in2_r_m); \ + UNPCK_R_SH_SW(in3, in3_r_m); \ + \ + constant_m = __msa_fill_w(sinpi_4_9); \ + MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \ + \ + constant_m = __msa_fill_w(sinpi_1_9); \ + s0_m += in0_r_m * constant_m; \ + s1_m -= in1_r_m * constant_m; \ + \ + constant_m = __msa_fill_w(sinpi_2_9); \ + s0_m += in1_r_m * constant_m; \ + s1_m += in3_r_m * constant_m; \ + \ + s2_m = in0_r_m + in1_r_m - in3_r_m; \ + \ + constant_m = __msa_fill_w(sinpi_3_9); \ + MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \ + \ + in0_r_m = s0_m + s3_m; \ + s2_m = s1_m - s3_m; \ + s3_m = s1_m - s0_m + s3_m; \ + \ + SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ + out0, out1, out2, out3); \ + } +#endif // AV1_ENCODER_MIPS_MSA_AV1_FDCT_MSA_H_ diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c new file mode 100644 index 000000000..4ec679642 --- /dev/null +++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./av1_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, int32_t filt_sth, + int32_t filt_wgt, uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, int32_t filt_wgt, + uint32_t *acc, uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else { + av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c new file mode 100644 index 000000000..355141de5 --- /dev/null +++ b/third_party/aom/av1/encoder/palette.c @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/encoder/cost.h" +#include "av1/encoder/palette.h" + +static float calc_dist(const float *p1, const float *p2, int dim) { + float dist = 0; + int i; + for (i = 0; i < dim; ++i) { + const float diff = p1[i] - p2[i]; + dist += diff * diff; + } + return dist; +} + +void av1_calc_indices(const float *data, const float *centroids, + uint8_t *indices, int n, int k, int dim) { + int i, j; + for (i = 0; i < n; ++i) { + float min_dist = calc_dist(data + i * dim, centroids, dim); + indices[i] = 0; + for (j = 1; j < k; ++j) { + const float this_dist = + calc_dist(data + i * dim, centroids + j * dim, dim); + if (this_dist < min_dist) { + min_dist = this_dist; + indices[i] = j; + } + } + } +} + +// Generate a random number in the range [0, 32768). +static unsigned int lcg_rand16(unsigned int *state) { + *state = (unsigned int)(*state * 1103515245ULL + 12345); + return *state / 65536 % 32768; +} + +static void calc_centroids(const float *data, float *centroids, + const uint8_t *indices, int n, int k, int dim) { + int i, j, index; + int count[PALETTE_MAX_SIZE]; + unsigned int rand_state = (unsigned int)data[0]; + + assert(n <= 32768); + + memset(count, 0, sizeof(count[0]) * k); + memset(centroids, 0, sizeof(centroids[0]) * k * dim); + + for (i = 0; i < n; ++i) { + index = indices[i]; + assert(index < k); + ++count[index]; + for (j = 0; j < dim; ++j) { + centroids[index * dim + j] += data[i * dim + j]; + } + } + + for (i = 0; i < k; ++i) { + if (count[i] == 0) { + memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim, + sizeof(centroids[0]) * dim); + } else { + const float norm = 1.0f / count[i]; + for (j = 0; j < dim; ++j) centroids[i * dim + j] *= norm; + } + } + + // Round to nearest integers. + for (i = 0; i < k * dim; ++i) { + centroids[i] = roundf(centroids[i]); + } +} + +static float calc_total_dist(const float *data, const float *centroids, + const uint8_t *indices, int n, int k, int dim) { + float dist = 0; + int i; + (void)k; + + for (i = 0; i < n; ++i) + dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim); + + return dist; +} + +void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n, + int k, int dim, int max_itr) { + int i; + float this_dist; + float pre_centroids[2 * PALETTE_MAX_SIZE]; + uint8_t pre_indices[MAX_SB_SQUARE]; + + av1_calc_indices(data, centroids, indices, n, k, dim); + this_dist = calc_total_dist(data, centroids, indices, n, k, dim); + + for (i = 0; i < max_itr; ++i) { + const float pre_dist = this_dist; + memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim); + memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n); + + calc_centroids(data, centroids, indices, n, k, dim); + av1_calc_indices(data, centroids, indices, n, k, dim); + this_dist = calc_total_dist(data, centroids, indices, n, k, dim); + + if (this_dist > pre_dist) { + memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim); + memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n); + break; + } + if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim)) + break; + } +} + +static int float_comparer(const void *a, const void *b) { + const float fa = *(const float *)a; + const float fb = *(const float *)b; + return (fa > fb) - (fa < fb); +} + +int av1_remove_duplicates(float *centroids, int num_centroids) { + int num_unique; // number of unique centroids + int i; + qsort(centroids, num_centroids, sizeof(*centroids), float_comparer); + // Remove duplicates. + num_unique = 1; + for (i = 1; i < num_centroids; ++i) { + if (centroids[i] != centroids[i - 1]) { // found a new unique centroid + centroids[num_unique++] = centroids[i]; + } + } + return num_unique; +} + +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { + int n = 0, r, c, i, val_count[256]; + uint8_t val; + memset(val_count, 0, sizeof(val_count)); + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + val = src[r * stride + c]; + ++val_count[val]; + } + } + + for (i = 0; i < 256; ++i) { + if (val_count[i]) { + ++n; + } + } + + return n; +} + +#if CONFIG_PALETTE_DELTA_ENCODING +int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *min_bits) { + const int n = pmi->palette_size[0]; + int max_d = 0, i; + *min_bits = bit_depth - 3; + for (i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[i] - pmi->palette_colors[i - 1]; + assert(delta > 0); + if (delta > max_d) max_d = delta; + } + return AOMMAX(av1_ceil_log2(max_d), *min_bits); +} + +int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *min_bits) { + const int n = pmi->palette_size[1]; + int max_d = 0, i; + *min_bits = bit_depth - 3; + for (i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[PALETTE_MAX_SIZE + i] - + pmi->palette_colors[PALETTE_MAX_SIZE + i - 1]; + assert(delta >= 0); + if (delta > max_d) max_d = delta; + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} + +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, + int *min_bits) { + const int n = pmi->palette_size[1]; + const int max_val = 1 << bit_depth; + int max_d = 0, i; + *min_bits = bit_depth - 4; + *zero_count = 0; + for (i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; + const int v = abs(delta); + const int d = AOMMIN(v, max_val - v); + if (d > max_d) max_d = d; + if (d == 0) ++(*zero_count); + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} +#endif // CONFIG_PALETTE_DELTA_ENCODING + +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + int bit_depth) { + const int n = pmi->palette_size[0]; +#if CONFIG_PALETTE_DELTA_ENCODING + int min_bits = 0; + const int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits); + return av1_cost_bit(128, 0) * (2 + bit_depth + bits * (n - 1)); +#else + return bit_depth * n * av1_cost_bit(128, 0); +#endif // CONFIG_PALETTE_DELTA_ENCODING +} + +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + int bit_depth) { + const int n = pmi->palette_size[1]; +#if CONFIG_PALETTE_DELTA_ENCODING + int cost = 0; + // U channel palette color cost. + int min_bits_u = 0; + const int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u); + cost += av1_cost_bit(128, 0) * (2 + bit_depth + bits_u * (n - 1)); + // V channel palette color cost. + int zero_count = 0, min_bits_v = 0; + const int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int bits_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int bits_using_raw = bit_depth * n; + cost += av1_cost_bit(128, 0) * (1 + AOMMIN(bits_using_delta, bits_using_raw)); + return cost; +#else + return 2 * bit_depth * n * av1_cost_bit(128, 0); +#endif // CONFIG_PALETTE_DELTA_ENCODING +} + +#if CONFIG_HIGHBITDEPTH +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth) { + int n = 0, r, c, i; + uint16_t val; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + int val_count[1 << 12]; + + assert(bit_depth <= 12); + memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + val = src[r * stride + c]; + ++val_count[val]; + } + } + + for (i = 0; i < (1 << bit_depth); ++i) { + if (val_count[i]) { + ++n; + } + } + + return n; +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h new file mode 100644 index 000000000..5403ac5e6 --- /dev/null +++ b/third_party/aom/av1/encoder/palette.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_PALETTE_H_ +#define AV1_ENCODER_PALETTE_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim', +// calculate the centroid 'indices' for the data points. +void av1_calc_indices(const float *data, const float *centroids, + uint8_t *indices, int n, int k, int dim); + +// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of +// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get +// updated 'centroids' and the centroid 'indices' for elements in 'data'. +// Note: the output centroids are rounded off to nearest integers. +void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n, + int k, int dim, int max_itr); + +// Given a list of centroids, returns the unique number of centroids 'k', and +// puts these unique centroids in first 'k' indices of 'centroids' array. +// Ideally, the centroids should be rounded to integers before calling this +// method. +int av1_remove_duplicates(float *centroids, int num_centroids); + +// Returns the number of colors in 'src'. +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols); +#if CONFIG_HIGHBITDEPTH +// Same as av1_count_colors(), but for high-bitdepth mode. +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth); +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_PALETTE_DELTA_ENCODING +// Return the number of bits used to transmit each luma palette color delta. +int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *min_bits); + +// Return the number of bits used to transmit each U palette color delta. +int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *min_bits); + +// Return the number of bits used to transmit each v palette color delta; +// assign zero_count with the number of deltas being 0. +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, int *min_bits); +#endif // CONFIG_PALETTE_DELTA_ENCODING + +// Return the rate cost for transmitting luma palette color values. +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, int bit_depth); + +// Return the rate cost for transmitting chroma palette color values. +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + int bit_depth); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* AV1_ENCODER_PALETTE_H_ */ diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c new file mode 100644 index 000000000..da64fb48d --- /dev/null +++ b/third_party/aom/av1/encoder/pickcdef.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_scale_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/common/cdef.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/encoder.h" + +#define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS) + +/* Search for the best strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one(int *lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], int sb_count) { + uint64_t tot_mse[TOTAL_STRENGTHS]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id = 0; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. */ + for (gi = 0; gi < nb_strengths; gi++) { + if (mse[i][lev[gi]] < best_mse) { + best_mse = mse[i][lev[gi]]; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < TOTAL_STRENGTHS; j++) { + uint64_t best = best_mse; + if (mse[i][j] < best) best = mse[i][j]; + tot_mse[j] += best; + } + } + for (j = 0; j < TOTAL_STRENGTHS; j++) { + if (tot_mse[j] < best_tot_mse) { + best_tot_mse = tot_mse[j]; + best_id = j; + } + } + lev[nb_strengths] = best_id; + return best_tot_mse; +} + +/* Search for the best luma+chroma strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], + int sb_count) { + uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id0 = 0; + int best_id1 = 0; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. */ + for (gi = 0; gi < nb_strengths; gi++) { + uint64_t curr = mse[0][i][lev0[gi]]; + curr += mse[1][i][lev1[gi]]; + if (curr < best_mse) { + best_mse = curr; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < TOTAL_STRENGTHS; j++) { + int k; + for (k = 0; k < TOTAL_STRENGTHS; k++) { + uint64_t best = best_mse; + uint64_t curr = mse[0][i][j]; + curr += mse[1][i][k]; + if (curr < best) best = curr; + tot_mse[j][k] += best; + } + } + } + for (j = 0; j < TOTAL_STRENGTHS; j++) { + int k; + for (k = 0; k < TOTAL_STRENGTHS; k++) { + if (tot_mse[j][k] < best_tot_mse) { + best_tot_mse = tot_mse[j][k]; + best_id0 = j; + best_id1 = k; + } + } + } + lev0[nb_strengths] = best_id0; + lev1[nb_strengths] = best_id1; + return best_tot_mse; +} + +/* Search for the set of strengths that minimizes mse. */ +static uint64_t joint_strength_search(int *best_lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], + int sb_count) { + uint64_t best_tot_mse; + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = search_one(best_lev, i, mse, sb_count); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; + best_tot_mse = search_one(best_lev, nb_strengths - 1, mse, sb_count); + } + return best_tot_mse; +} + +/* Search for the set of luma+chroma strengths that minimizes mse. */ +static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, + int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], + int sb_count) { + uint64_t best_tot_mse; + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = search_one_dual(best_lev0, best_lev1, i, mse, sb_count); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) { + best_lev0[j] = best_lev0[j + 1]; + best_lev1[j] = best_lev1[j + 1]; + } + best_tot_mse = + search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count); + } + return best_tot_mse; +} + +/* FIXME: SSE-optimize this. */ +static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src, + int src_voffset, int src_hoffset, int sstride, + int vsize, int hsize) { + int r, c; + const uint16_t *base = &src[src_voffset * sstride + src_hoffset]; + for (r = 0; r < vsize; r++) { + for (c = 0; c < hsize; c++) { + dst[r * dstride + c] = base[r * sstride + c]; + } + } +} + +static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, + int sstride, int coeff_shift) { + uint64_t svar = 0; + uint64_t dvar = 0; + uint64_t sum_s = 0; + uint64_t sum_d = 0; + uint64_t sum_s2 = 0; + uint64_t sum_d2 = 0; + uint64_t sum_sd = 0; + int i, j; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + sum_s += src[i * sstride + j]; + sum_d += dst[i * dstride + j]; + sum_s2 += src[i * sstride + j] * src[i * sstride + j]; + sum_d2 += dst[i * dstride + j] * dst[i * dstride + j]; + sum_sd += src[i * sstride + j] * dst[i * dstride + j]; + } + } + /* Compute the variance -- the calculation cannot go negative. */ + svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); + dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); + return (uint64_t)floor( + .5 + + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * + (svar + dvar + (400 << 2 * coeff_shift)) / + (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar))); +} + +static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src, + int sstride) { + uint64_t sum = 0; + int i, j; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int e = dst[i * dstride + j] - src[i * sstride + j]; + sum += e * e; + } + } + return sum; +} + +static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src, + int sstride) { + uint64_t sum = 0; + int i, j; + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + int e = dst[i * dstride + j] - src[i * sstride + j]; + sum += e * e; + } + } + return sum; +} + +/* Compute MSE only on the blocks we filtered. */ +uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src, + dering_list *dlist, int dering_count, + BLOCK_SIZE bsize, int coeff_shift, int pli) { + uint64_t sum = 0; + int bi, bx, by; + if (bsize == BLOCK_8X8) { + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + if (pli == 0) { + sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, + &src[bi << (3 + 3)], 8, coeff_shift); + } else { + sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride, + &src[bi << (3 + 3)], 8); + } + } + } else if (bsize == BLOCK_4X8) { + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride, + &src[bi << (3 + 2)], 4); + sum += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], dstride, + &src[(bi << (3 + 2)) + 4 * 4], 4); + } + } else if (bsize == BLOCK_8X4) { + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride, + &src[bi << (2 + 3)], 8); + sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride, + &src[(bi << (2 + 3)) + 4], 8); + } + } else { + assert(bsize == BLOCK_4X4); + for (bi = 0; bi < dering_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride, + &src[bi << (2 + 2)], 4); + } + } + return sum >> 2 * coeff_shift; +} + +void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd) { + int r, c; + int sbr, sbc; + uint16_t *src[3]; + uint16_t *ref_coeff[3]; + dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; + int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; + int stride[3]; + int bsize[3]; + int mi_wide_l2[3]; + int mi_high_l2[3]; + int xdec[3]; + int ydec[3]; + int pli; + int dering_count; + int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); + uint64_t best_tot_mse = (uint64_t)1 << 63; + uint64_t tot_mse; + int sb_count; + int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; + int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; + int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); + int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); + uint64_t(*mse[2])[TOTAL_STRENGTHS]; + int clpf_damping = 3 + (cm->base_qindex >> 6); + int dering_damping = 6; + int i; + int nb_strengths; + int nb_strength_bits; + int quantizer; + double lambda; + int nplanes = 3; + DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]); + uint16_t *in; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SQUARE]); + int chroma_dering = + xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && + xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; + quantizer = + av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8); + lambda = .12 * quantizer * quantizer / 256.; + + av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0); + mse[0] = aom_malloc(sizeof(**mse) * nvsb * nhsb); + mse[1] = aom_malloc(sizeof(**mse) * nvsb * nhsb); + for (pli = 0; pli < nplanes; pli++) { + uint8_t *ref_buffer; + int ref_stride; + switch (pli) { + case 0: + ref_buffer = ref->y_buffer; + ref_stride = ref->y_stride; + break; + case 1: + ref_buffer = ref->u_buffer; + ref_stride = ref->uv_stride; + break; + case 2: + ref_buffer = ref->v_buffer; + ref_stride = ref->uv_stride; + break; + } + src[pli] = aom_memalign( + 32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE); + ref_coeff[pli] = aom_memalign( + 32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE); + xdec[pli] = xd->plane[pli].subsampling_x; + ydec[pli] = xd->plane[pli].subsampling_y; + bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) + : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8); + stride[pli] = cm->mi_cols << MI_SIZE_LOG2; + mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; + mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; + + const int frame_height = + (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y; + const int frame_width = + (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x; + + for (r = 0; r < frame_height; ++r) { + for (c = 0; c < frame_width; ++c) { +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR( + xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c]; + ref_coeff[pli][r * stride[pli] + c] = + CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c]; + } else { +#endif + src[pli][r * stride[pli] + c] = + xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c]; + ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c]; +#if CONFIG_HIGHBITDEPTH + } +#endif + } + } + } + in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER; + sb_count = 0; + for (sbr = 0; sbr < nvsb; ++sbr) { + for (sbc = 0; sbc < nhsb; ++sbc) { + int nvb, nhb; + int gi; + int dirinit = 0; + nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); + nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); + cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + + MAX_MIB_SIZE * sbc] + ->mbmi.cdef_strength = -1; + if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue; + dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE, + sbc * MAX_MIB_SIZE, dlist, 1); + for (pli = 0; pli < nplanes; pli++) { + for (i = 0; i < OD_DERING_INBUF_SIZE; i++) + inbuf[i] = OD_DERING_VERY_LARGE; + for (gi = 0; gi < TOTAL_STRENGTHS; gi++) { + int threshold; + uint64_t curr_mse; + int clpf_strength; + threshold = gi / CLPF_STRENGTHS; + if (pli > 0 && !chroma_dering) threshold = 0; + /* We avoid filtering the pixels for which some of the pixels to + average + are outside the frame. We could change the filter instead, but it + would add special cases for any future vectorization. */ + int yoff = OD_FILT_VBORDER * (sbr != 0); + int xoff = OD_FILT_HBORDER * (sbc != 0); + int ysize = (nvb << mi_high_l2[pli]) + + OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff; + int xsize = (nhb << mi_wide_l2[pli]) + + OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff; + clpf_strength = gi % CLPF_STRENGTHS; + if (clpf_strength == 0) + copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE, + src[pli], + (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) - yoff, + (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff, + stride[pli], ysize, xsize); + od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE, + tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli, + dlist, dering_count, threshold, + clpf_strength + (clpf_strength == 3), clpf_damping, + dering_damping, coeff_shift, clpf_strength != 0, 1); + curr_mse = compute_dering_dist( + ref_coeff[pli] + + (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] + + (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]), + stride[pli], tmp_dst, dlist, dering_count, bsize[pli], + coeff_shift, pli); + if (pli < 2) + mse[pli][sb_count][gi] = curr_mse; + else + mse[1][sb_count][gi] += curr_mse; + sb_index[sb_count] = + MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc; + } + } + sb_count++; + } + } + nb_strength_bits = 0; + /* Search for different number of signalling bits. */ + for (i = 0; i <= 3; i++) { + int j; + int best_lev0[CDEF_MAX_STRENGTHS]; + int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; + nb_strengths = 1 << i; + if (nplanes >= 3) + tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, + mse, sb_count); + else + tot_mse = + joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count); + /* Count superblock signalling cost. */ + tot_mse += (uint64_t)(sb_count * lambda * i); + /* Count header signalling cost. */ + tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS); + if (tot_mse < best_tot_mse) { + best_tot_mse = tot_mse; + nb_strength_bits = i; + for (j = 0; j < 1 << nb_strength_bits; j++) { + cm->cdef_strengths[j] = best_lev0[j]; + cm->cdef_uv_strengths[j] = best_lev1[j]; + } + } + } + nb_strengths = 1 << nb_strength_bits; + + cm->cdef_bits = nb_strength_bits; + cm->nb_cdef_strengths = nb_strengths; + for (i = 0; i < sb_count; i++) { + int gi; + int best_gi; + uint64_t best_mse = (uint64_t)1 << 63; + best_gi = 0; + for (gi = 0; gi < cm->nb_cdef_strengths; gi++) { + uint64_t curr = mse[0][i][cm->cdef_strengths[gi]]; + if (nplanes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]]; + if (curr < best_mse) { + best_gi = gi; + best_mse = curr; + } + } + selected_strength[i] = best_gi; + cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi; + } + cm->cdef_dering_damping = dering_damping; + cm->cdef_clpf_damping = clpf_damping; + aom_free(mse[0]); + aom_free(mse[1]); + for (pli = 0; pli < nplanes; pli++) { + aom_free(src[pli]); + aom_free(ref_coeff[pli]); + } + aom_free(sb_index); + aom_free(selected_strength); +} diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c new file mode 100644 index 000000000..fc0ea485d --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_scale_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_loopfilter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/quant_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" + +int av1_get_max_filter_level(const AV1_COMP *cpi) { + if (cpi->oxcf.pass == 2) { + return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + AV1_COMP *const cpi, int filt_level, + int partial_frame) { + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, + partial_frame); +#else + if (cpi->num_workers > 1) + av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, + filt_level, 1, partial_frame, cpi->workers, + cpi->num_workers, &cpi->lf_row_sync); + else + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + 1, partial_frame); +#endif + +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show); + } else { + filt_err = aom_get_y_sse(sd, cm->frame_to_show); + } +#else + filt_err = aom_get_y_sse(sd, cm->frame_to_show); +#endif // CONFIG_HIGHBITDEPTH + + // Re-instate the unfiltered frame + aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + + return filt_err; +} + +int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, double *best_cost_ret) { + const AV1_COMMON *const cm = &cpi->common; + const struct loopfilter *const lf = &cm->lf; + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + MACROBLOCK *x = &cpi->td.mb; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + + // Make a copy of the unfiltered / processed recon buffer + aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > 0) { + const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. + int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->twopass.section_intra_rating) / 20; + + // yx, bias less for large block size + if (cm->tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); + } + // If value is close to the best so far then bias towards a lower loop + // filter value. + if (ss_err[filt_low] < (best_err + bias)) { + // Was it actually better than the previous best? + if (ss_err[filt_low] < best_err) { + best_err = ss_err[filt_low]; + } + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); + } + // If value is significantly better than previous best, bias added against + // raising filter value + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Half the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + // Update best error + best_err = ss_err[filt_best]; + + if (best_cost_ret) + *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err); + return filt_best; +} + +void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + AV1_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; + + lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; + + if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { + lf->filter_level = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth); +// These values were determined by linear fitting the result of the +// searched level, filt_guess = q * 0.316206 + 3.87252 +#if CONFIG_HIGHBITDEPTH + int filt_guess; + switch (cm->bit_depth) { + case AOM_BITS_8: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + break; + case AOM_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + case AOM_BITS_12: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + default: + assert(0 && + "bit_depth should be AOM_BITS_8, AOM_BITS_10 " + "or AOM_BITS_12"); + return; + } +#else + int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); +#endif // CONFIG_HIGHBITDEPTH + if (cm->frame_type == KEY_FRAME) filt_guess -= 4; + lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); + } else { + lf->filter_level = av1_search_filter_level( + sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL); + } + +#if CONFIG_EXT_TILE + // TODO(any): 0 loopfilter level is only necessary if individual tile + // decoding is required. We need to communicate this requirement to this + // code and force loop filter level 0 only if required. + if (cm->tile_encoding_mode) lf->filter_level = 0; +#endif // CONFIG_EXT_TILE +} diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h new file mode 100644 index 000000000..3c0a83462 --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_PICKLPF_H_ +#define AV1_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; +int av1_get_max_filter_level(const AV1_COMP *cpi); +int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, double *err); +void av1_pick_filter_level(const struct yv12_buffer_config *sd, + struct AV1_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_PICKLPF_H_ diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c new file mode 100644 index 000000000..21410e0af --- /dev/null +++ b/third_party/aom/av1/encoder/pickrst.c @@ -0,0 +1,1269 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "./aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/onyxc_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" + +// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed. +// When set to RESTORE_NONE (0) we allow switchable. +const RestorationType force_restore_type = RESTORE_NONE; + +// Number of Wiener iterations +#define NUM_WIENER_ITERS 10 + +typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src, + AV1_COMP *cpi, int partial_frame, + RestorationInfo *info, + RestorationType *rest_level, + double *best_tile_cost, + YV12_BUFFER_CONFIG *dst_frame); + +const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 }; + +static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, + const AV1_COMMON *cm, int h_start, + int width, int v_start, int height, + int components_pattern) { + int64_t filt_err = 0; + (void)cm; + // Y and UV components cannot be mixed + assert(components_pattern == 1 || components_pattern == 2 || + components_pattern == 4 || components_pattern == 6); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + if ((components_pattern >> AOM_PLANE_Y) & 1) { + filt_err += + aom_highbd_get_y_sse_part(src, dst, h_start, width, v_start, height); + } + if ((components_pattern >> AOM_PLANE_U) & 1) { + filt_err += + aom_highbd_get_u_sse_part(src, dst, h_start, width, v_start, height); + } + if ((components_pattern >> AOM_PLANE_V) & 1) { + filt_err += + aom_highbd_get_v_sse_part(src, dst, h_start, width, v_start, height); + } + return filt_err; + } +#endif // CONFIG_HIGHBITDEPTH + if ((components_pattern >> AOM_PLANE_Y) & 1) { + filt_err += aom_get_y_sse_part(src, dst, h_start, width, v_start, height); + } + if ((components_pattern >> AOM_PLANE_U) & 1) { + filt_err += aom_get_u_sse_part(src, dst, h_start, width, v_start, height); + } + if ((components_pattern >> AOM_PLANE_V) & 1) { + filt_err += aom_get_v_sse_part(src, dst, h_start, width, v_start, height); + } + return filt_err; +} + +static int64_t sse_restoration_frame(AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, + int components_pattern) { + int64_t filt_err = 0; +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + if ((components_pattern >> AOM_PLANE_Y) & 1) { + filt_err += aom_highbd_get_y_sse(src, dst); + } + if ((components_pattern >> AOM_PLANE_U) & 1) { + filt_err += aom_highbd_get_u_sse(src, dst); + } + if ((components_pattern >> AOM_PLANE_V) & 1) { + filt_err += aom_highbd_get_v_sse(src, dst); + } + return filt_err; + } +#else + (void)cm; +#endif // CONFIG_HIGHBITDEPTH + if ((components_pattern >> AOM_PLANE_Y) & 1) { + filt_err = aom_get_y_sse(src, dst); + } + if ((components_pattern >> AOM_PLANE_U) & 1) { + filt_err += aom_get_u_sse(src, dst); + } + if ((components_pattern >> AOM_PLANE_V) & 1) { + filt_err += aom_get_v_sse(src, dst); + } + return filt_err; +} + +static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src, + AV1_COMP *const cpi, RestorationInfo *rsi, + int components_pattern, int partial_frame, + int tile_idx, int subtile_idx, + int subtile_bits, + YV12_BUFFER_CONFIG *dst_frame) { + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + int tile_width, tile_height, nhtiles, nvtiles; + int h_start, h_end, v_start, v_end; + int ntiles, width, height; + + // Y and UV components cannot be mixed + assert(components_pattern == 1 || components_pattern == 2 || + components_pattern == 4 || components_pattern == 6); + + if (components_pattern == 1) { // Y only + width = src->y_crop_width; + height = src->y_crop_height; + } else { // Color + width = src->uv_crop_width; + height = src->uv_crop_height; + } + ntiles = av1_get_rest_ntiles( + width, height, cm->rst_info[components_pattern > 1].restoration_tilesize, + &tile_width, &tile_height, &nhtiles, &nvtiles); + (void)ntiles; + + av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, + partial_frame, dst_frame); + av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, nhtiles, + nvtiles, tile_width, tile_height, width, height, 0, + 0, &h_start, &h_end, &v_start, &v_end); + filt_err = sse_restoration_tile(src, dst_frame, cm, h_start, h_end - h_start, + v_start, v_end - v_start, components_pattern); + + return filt_err; +} + +static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src, + AV1_COMP *const cpi, RestorationInfo *rsi, + int components_pattern, int partial_frame, + YV12_BUFFER_CONFIG *dst_frame) { + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern, + partial_frame, dst_frame); + filt_err = sse_restoration_frame(cm, src, dst_frame, components_pattern); + return filt_err; +} + +static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height, + int src_stride, uint8_t *dat8, + int dat_stride, int bit_depth, + int32_t *flt1, int flt1_stride, + int32_t *flt2, int flt2_stride, int *xqd) { + int i, j; + int64_t err = 0; + int xq[2]; + decode_xq(xqd, xq); + if (bit_depth == 8) { + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t u = + (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; + const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; + const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - + src[i * src_stride + j]; + err += e * e; + } + } + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t u = + (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u; + const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u; + const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - + src[i * src_stride + j]; + err += e * e; + } + } + } + return err; +} + +static void get_proj_subspace(uint8_t *src8, int width, int height, + int src_stride, uint8_t *dat8, int dat_stride, + int bit_depth, int32_t *flt1, int flt1_stride, + int32_t *flt2, int flt2_stride, int *xq) { + int i, j; + double H[2][2] = { { 0, 0 }, { 0, 0 } }; + double C[2] = { 0, 0 }; + double Det; + double x[2]; + const int size = width * height; + + aom_clear_system_state(); + + // Default + xq[0] = 0; + xq[1] = 0; + if (bit_depth == 8) { + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const double s = + (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const double f1 = (double)flt1[i * flt1_stride + j] - u; + const double f2 = (double)flt2[i * flt2_stride + j] - u; + H[0][0] += f1 * f1; + H[1][1] += f2 * f2; + H[0][1] += f1 * f2; + C[0] += f1 * s; + C[1] += f2 * s; + } + } + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const double s = + (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const double f1 = (double)flt1[i * flt1_stride + j] - u; + const double f2 = (double)flt2[i * flt2_stride + j] - u; + H[0][0] += f1 * f1; + H[1][1] += f2 * f2; + H[0][1] += f1 * f2; + C[0] += f1 * s; + C[1] += f2 * s; + } + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; + Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]); + if (Det < 1e-8) return; // ill-posed, return default values + x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det; + x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det; + xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS)); + xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS)); +} + +void encode_xq(int *xq, int *xqd) { + xqd[0] = xq[0]; + xqd[0] = clamp(xqd[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1]; + xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); +} + +static void search_selfguided_restoration(uint8_t *dat8, int width, int height, + int dat_stride, uint8_t *src8, + int src_stride, int bit_depth, + int *eps, int *xqd, int32_t *rstbuf) { + int32_t *flt1 = rstbuf; + int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; + int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; + int ep, bestep = 0; + int64_t err, besterr = -1; + int exqd[2], bestxqd[2] = { 0, 0 }; + + for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { + int exq[2]; +#if CONFIG_HIGHBITDEPTH + if (bit_depth > 8) { + uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); +#if USE_HIGHPASS_IN_SGRPROJ + av1_highpass_filter_highbd(dat, width, height, dat_stride, flt1, width, + sgr_params[ep].corner, sgr_params[ep].edge); +#else + av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1, + width, bit_depth, sgr_params[ep].r1, + sgr_params[ep].e1, tmpbuf2); +#endif // USE_HIGHPASS_IN_SGRPROJ + av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2, + width, bit_depth, sgr_params[ep].r2, + sgr_params[ep].e2, tmpbuf2); + } else { +#endif +#if USE_HIGHPASS_IN_SGRPROJ + av1_highpass_filter(dat8, width, height, dat_stride, flt1, width, + sgr_params[ep].corner, sgr_params[ep].edge); +#else + av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width, + sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2); +#endif // USE_HIGHPASS_IN_SGRPROJ + av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width, + sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2); +#if CONFIG_HIGHBITDEPTH + } +#endif + get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, + bit_depth, flt1, width, flt2, width, exq); + encode_xq(exq, exqd); + err = + get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, + bit_depth, flt1, width, flt2, width, exqd); + if (besterr == -1 || err < besterr) { + bestep = ep; + besterr = err; + bestxqd[0] = exqd[0]; + bestxqd[1] = exqd[1]; + } + } + *eps = bestep; + xqd[0] = bestxqd[0]; + xqd[1] = bestxqd[1]; +} + +static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info) { + int bits = SGRPROJ_PARAMS_BITS; + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + return bits; +} + +static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, + int partial_frame, RestorationInfo *info, + RestorationType *type, double *best_tile_cost, + YV12_BUFFER_CONFIG *dst_frame) { + SgrprojInfo *sgrproj_info = info->sgrproj_info; + double err, cost_norestore, cost_sgrproj; + int bits; + MACROBLOCK *x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + RestorationInfo *rsi = &cpi->rst_search[0]; + int tile_idx, tile_width, tile_height, nhtiles, nvtiles; + int h_start, h_end, v_start, v_end; + // Allocate for the src buffer at high precision + const int ntiles = av1_get_rest_ntiles( + cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width, + &tile_height, &nhtiles, &nvtiles); + SgrprojInfo ref_sgrproj_info; + set_default_sgrproj(&ref_sgrproj_info); + + rsi->frame_restoration_type = RESTORE_SGRPROJ; + + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + rsi->restoration_type[tile_idx] = RESTORE_NONE; + } + // Compute best Sgrproj filters for each tile + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, cm->width, cm->height, 0, 0, &h_start, + &h_end, &v_start, &v_end); + err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, + h_end - h_start, v_start, v_end - v_start, 1); + // #bits when a tile is not restored + bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0); + cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + best_tile_cost[tile_idx] = DBL_MAX; + search_selfguided_restoration( + dgd->y_buffer + v_start * dgd->y_stride + h_start, h_end - h_start, + v_end - v_start, dgd->y_stride, + src->y_buffer + v_start * src->y_stride + h_start, src->y_stride, +#if CONFIG_HIGHBITDEPTH + cm->bit_depth, +#else + 8, +#endif // CONFIG_HIGHBITDEPTH + &rsi->sgrproj_info[tile_idx].ep, rsi->sgrproj_info[tile_idx].xqd, + cm->rst_internal.tmpbuf); + rsi->restoration_type[tile_idx] = RESTORE_SGRPROJ; + err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0, + dst_frame); + bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) + << AV1_PROB_COST_SHIFT; + bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1); + cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + if (cost_sgrproj >= cost_norestore) { + type[tile_idx] = RESTORE_NONE; + } else { + type[tile_idx] = RESTORE_SGRPROJ; + memcpy(&sgrproj_info[tile_idx], &rsi->sgrproj_info[tile_idx], + sizeof(sgrproj_info[tile_idx])); + bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_sgrproj_info, &sgrproj_info[tile_idx], + sizeof(ref_sgrproj_info)); + best_tile_cost[tile_idx] = err; + } + rsi->restoration_type[tile_idx] = RESTORE_NONE; + } + // Cost for Sgrproj filtering + set_default_sgrproj(&ref_sgrproj_info); + bits = frame_level_restore_bits[rsi->frame_restoration_type] + << AV1_PROB_COST_SHIFT; + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + bits += + av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, type[tile_idx] != RESTORE_NONE); + memcpy(&rsi->sgrproj_info[tile_idx], &sgrproj_info[tile_idx], + sizeof(sgrproj_info[tile_idx])); + if (type[tile_idx] == RESTORE_SGRPROJ) { + bits += + count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx], + sizeof(ref_sgrproj_info)); + } + rsi->restoration_type[tile_idx] = type[tile_idx]; + } + err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame); + cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + + return cost_sgrproj; +} + +static double find_average(uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int stride) { + uint64_t sum = 0; + double avg = 0; + int i, j; + aom_clear_system_state(); + for (i = v_start; i < v_end; i++) + for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; + avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); + return avg; +} + +static void compute_stats(uint8_t *dgd, uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, double *M, double *H) { + int i, j, k, l; + double Y[WIENER_WIN2]; + const double avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + memset(M, 0, sizeof(*M) * WIENER_WIN2); + memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const double X = (double)src[i * src_stride + j] - avg; + int idx = 0; + for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) { + for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) { + Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg; + idx++; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + M[k] += Y[k] * X; + H[k * WIENER_WIN2 + k] += Y[k] * Y[k]; + for (l = k + 1; l < WIENER_WIN2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + H[k * WIENER_WIN2 + l] += Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = k + 1; l < WIENER_WIN2; ++l) { + H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l]; + } + } +} + +#if CONFIG_HIGHBITDEPTH +static double find_average_highbd(uint16_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + double avg = 0; + int i, j; + aom_clear_system_state(); + for (i = v_start; i < v_end; i++) + for (j = h_start; j < h_end; j++) sum += src[i * stride + j]; + avg = (double)sum / ((v_end - v_start) * (h_end - h_start)); + return avg; +} + +static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, double *M, + double *H) { + int i, j, k, l; + double Y[WIENER_WIN2]; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const double avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + memset(M, 0, sizeof(*M) * WIENER_WIN2); + memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const double X = (double)src[i * src_stride + j] - avg; + int idx = 0; + for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) { + for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) { + Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg; + idx++; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + M[k] += Y[k] * X; + H[k * WIENER_WIN2 + k] += Y[k] * Y[k]; + for (l = k + 1; l < WIENER_WIN2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + H[k * WIENER_WIN2 + l] += Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = k + 1; l < WIENER_WIN2; ++l) { + H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l]; + } + } +} +#endif // CONFIG_HIGHBITDEPTH + +// Solves Ax = b, where x and b are column vectors +static int linsolve(int n, double *A, int stride, double *b, double *x) { + int i, j, k; + double c; + + aom_clear_system_state(); + + // Forward elimination + for (k = 0; k < n - 1; k++) { + // Bring the largest magitude to the diagonal position + for (i = n - 1; i > k; i--) { + if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) { + for (j = 0; j < n; j++) { + c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + for (i = k; i < n - 1; i++) { + if (fabs(A[k * stride + k]) < 1e-10) return 0; + c = A[(i + 1) * stride + k] / A[k * stride + k]; + for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j]; + b[i + 1] -= c * b[k]; + } + } + // Backward substitution + for (i = n - 1; i >= 0; i--) { + if (fabs(A[i * stride + i]) < 1e-10) return 0; + c = 0; + for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j]; + x[i] = (b[i] - c) / A[i * stride + i]; + } + + return 1; +} + +static INLINE int wrap_index(int i) { + return (i >= WIENER_HALFWIN1 ? WIENER_WIN - 1 - i : i); +} + +// Fix vector b, update vector a +static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) { + int i, j; + double S[WIENER_WIN]; + double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + int w, w2; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < WIENER_WIN; i++) { + for (j = 0; j < WIENER_WIN; ++j) { + const int jj = wrap_index(j); + A[jj] += Mc[i][j] * b[i]; + } + } + for (i = 0; i < WIENER_WIN; i++) { + for (j = 0; j < WIENER_WIN; j++) { + int k, l; + for (k = 0; k < WIENER_WIN; ++k) + for (l = 0; l < WIENER_WIN; ++l) { + const int kk = wrap_index(k); + const int ll = wrap_index(l); + B[ll * WIENER_HALFWIN1 + kk] += + Hc[j * WIENER_WIN + i][k * WIENER_WIN2 + l] * b[i] * b[j]; + } + } + } + // Normalization enforcement in the system of equations itself + w = WIENER_WIN; + w2 = (w >> 1) + 1; + for (i = 0; i < w2 - 1; ++i) + A[i] -= + A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)]; + for (i = 0; i < w2 - 1; ++i) + for (j = 0; j < w2 - 1; ++j) + B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] - + 2 * B[(w2 - 1) * w2 + (w2 - 1)]); + if (linsolve(w2 - 1, B, w2, A, S)) { + S[w2 - 1] = 1.0; + for (i = w2; i < w; ++i) { + S[i] = S[w - 1 - i]; + S[w2 - 1] -= 2 * S[i]; + } + memcpy(a, S, w * sizeof(*a)); + } +} + +// Fix vector a, update vector b +static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) { + int i, j; + double S[WIENER_WIN]; + double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + int w, w2; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < WIENER_WIN; i++) { + const int ii = wrap_index(i); + for (j = 0; j < WIENER_WIN; j++) A[ii] += Mc[i][j] * a[j]; + } + + for (i = 0; i < WIENER_WIN; i++) { + for (j = 0; j < WIENER_WIN; j++) { + const int ii = wrap_index(i); + const int jj = wrap_index(j); + int k, l; + for (k = 0; k < WIENER_WIN; ++k) + for (l = 0; l < WIENER_WIN; ++l) + B[jj * WIENER_HALFWIN1 + ii] += + Hc[i * WIENER_WIN + j][k * WIENER_WIN2 + l] * a[k] * a[l]; + } + } + // Normalization enforcement in the system of equations itself + w = WIENER_WIN; + w2 = WIENER_HALFWIN1; + for (i = 0; i < w2 - 1; ++i) + A[i] -= + A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)]; + for (i = 0; i < w2 - 1; ++i) + for (j = 0; j < w2 - 1; ++j) + B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] - + 2 * B[(w2 - 1) * w2 + (w2 - 1)]); + if (linsolve(w2 - 1, B, w2, A, S)) { + S[w2 - 1] = 1.0; + for (i = w2; i < w; ++i) { + S[i] = S[w - 1 - i]; + S[w2 - 1] -= 2 * S[i]; + } + memcpy(b, S, w * sizeof(*b)); + } +} + +static int wiener_decompose_sep_sym(double *M, double *H, double *a, + double *b) { + static const double init_filt[WIENER_WIN] = { + 0.035623, -0.127154, 0.211436, 0.760190, 0.211436, -0.127154, 0.035623, + }; + int i, j, iter; + double *Hc[WIENER_WIN2]; + double *Mc[WIENER_WIN]; + for (i = 0; i < WIENER_WIN; i++) { + Mc[i] = M + i * WIENER_WIN; + for (j = 0; j < WIENER_WIN; j++) { + Hc[i * WIENER_WIN + j] = + H + i * WIENER_WIN * WIENER_WIN2 + j * WIENER_WIN; + } + } + memcpy(a, init_filt, sizeof(*a) * WIENER_WIN); + memcpy(b, init_filt, sizeof(*b) * WIENER_WIN); + + iter = 1; + while (iter < NUM_WIENER_ITERS) { + update_a_sep_sym(Mc, Hc, a, b); + update_b_sep_sym(Mc, Hc, a, b); + iter++; + } + return 1; +} + +// Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares +// against identity filters; Final score is defined as the difference between +// the function values +static double compute_score(double *M, double *H, InterpKernel vfilt, + InterpKernel hfilt) { + double ab[WIENER_WIN * WIENER_WIN]; + int i, k, l; + double P = 0, Q = 0; + double iP = 0, iQ = 0; + double Score, iScore; + double a[WIENER_WIN], b[WIENER_WIN]; + + aom_clear_system_state(); + + a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = 1.0; + for (i = 0; i < WIENER_HALFWIN; ++i) { + a[i] = a[WIENER_WIN - i - 1] = (double)vfilt[i] / WIENER_FILT_STEP; + b[i] = b[WIENER_WIN - i - 1] = (double)hfilt[i] / WIENER_FILT_STEP; + a[WIENER_HALFWIN] -= 2 * a[i]; + b[WIENER_HALFWIN] -= 2 * b[i]; + } + for (k = 0; k < WIENER_WIN; ++k) { + for (l = 0; l < WIENER_WIN; ++l) ab[k * WIENER_WIN + l] = a[l] * b[k]; + } + for (k = 0; k < WIENER_WIN2; ++k) { + P += ab[k] * M[k]; + for (l = 0; l < WIENER_WIN2; ++l) + Q += ab[k] * H[k * WIENER_WIN2 + l] * ab[l]; + } + Score = Q - 2 * P; + + iP = M[WIENER_WIN2 >> 1]; + iQ = H[(WIENER_WIN2 >> 1) * WIENER_WIN2 + (WIENER_WIN2 >> 1)]; + iScore = iQ - 2 * iP; + + return Score - iScore; +} + +static void quantize_sym_filter(double *f, InterpKernel fi) { + int i; + for (i = 0; i < WIENER_HALFWIN; ++i) { + fi[i] = RINT(f[i] * WIENER_FILT_STEP); + } + // Specialize for 7-tap filter + fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); + fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + // Satisfy filter constraints + fi[WIENER_WIN - 1] = fi[0]; + fi[WIENER_WIN - 2] = fi[1]; + fi[WIENER_WIN - 3] = fi[2]; + // The central element has an implicit +WIENER_FILT_STEP + fi[3] = -2 * (fi[0] + fi[1] + fi[2]); +} + +static int count_wiener_bits(WienerInfo *wiener_info, + WienerInfo *ref_wiener_info) { + int bits = 0; + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + return bits; +} + +static double search_wiener_uv(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, + int partial_frame, int plane, + RestorationInfo *info, RestorationType *type, + YV12_BUFFER_CONFIG *dst_frame) { + WienerInfo *wiener_info = info->wiener_info; + AV1_COMMON *const cm = &cpi->common; + RestorationInfo *rsi = cpi->rst_search; + int64_t err; + int bits; + double cost_wiener, cost_norestore, cost_wiener_frame, cost_norestore_frame; + MACROBLOCK *x = &cpi->td.mb; + double M[WIENER_WIN2]; + double H[WIENER_WIN2 * WIENER_WIN2]; + double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; + const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + const int width = src->uv_crop_width; + const int height = src->uv_crop_height; + const int src_stride = src->uv_stride; + const int dgd_stride = dgd->uv_stride; + double score; + int tile_idx, tile_width, tile_height, nhtiles, nvtiles; + int h_start, h_end, v_start, v_end; + const int ntiles = + av1_get_rest_ntiles(width, height, cm->rst_info[1].restoration_tilesize, + &tile_width, &tile_height, &nhtiles, &nvtiles); + WienerInfo ref_wiener_info; + set_default_wiener(&ref_wiener_info); + assert(width == dgd->uv_crop_width); + assert(height == dgd->uv_crop_height); + + rsi[plane].frame_restoration_type = RESTORE_NONE; + err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); + bits = 0; + cost_norestore_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + + rsi[plane].frame_restoration_type = RESTORE_WIENER; + + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; + } + + // Compute best Wiener filters for each tile + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, width, height, 0, 0, &h_start, &h_end, + &v_start, &v_end); + err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, + h_end - h_start, v_start, v_end - v_start, + 1 << plane); + // #bits when a tile is not restored + bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); + cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + // best_tile_cost[tile_idx] = DBL_MAX; + + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, width, height, WIENER_HALFWIN, + WIENER_HALFWIN, &h_start, &h_end, &v_start, + &v_end); + if (plane == AOM_PLANE_U) { +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + compute_stats_highbd(dgd->u_buffer, src->u_buffer, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H); + else +#endif // CONFIG_HIGHBITDEPTH + compute_stats(dgd->u_buffer, src->u_buffer, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H); + } else if (plane == AOM_PLANE_V) { +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + compute_stats_highbd(dgd->v_buffer, src->v_buffer, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H); + else +#endif // CONFIG_HIGHBITDEPTH + compute_stats(dgd->v_buffer, src->v_buffer, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H); + } else { + assert(0); + } + + type[tile_idx] = RESTORE_WIENER; + + if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) { + type[tile_idx] = RESTORE_NONE; + continue; + } + quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter); + quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter); + + // Filter score computes the value of the function x'*A*x - x'*b for the + // learned filter and compares it against identity filer. If there is no + // reduction in the function, the filter is reverted back to identity + score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter, + rsi[plane].wiener_info[tile_idx].hfilter); + if (score > 0.0) { + type[tile_idx] = RESTORE_NONE; + continue; + } + + rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER; + err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + bits = + count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + // bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT; + bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); + cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + if (cost_wiener >= cost_norestore) { + type[tile_idx] = RESTORE_NONE; + } else { + type[tile_idx] = RESTORE_WIENER; + memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx], + sizeof(wiener_info[tile_idx])); + memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], + sizeof(ref_wiener_info)); + } + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; + } + // Cost for Wiener filtering + set_default_wiener(&ref_wiener_info); + bits = 0; + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + bits += + av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); + memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx], + sizeof(wiener_info[tile_idx])); + if (type[tile_idx] == RESTORE_WIENER) { + bits += + count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], + sizeof(ref_wiener_info)); + } + rsi[plane].restoration_type[tile_idx] = type[tile_idx]; + } + err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame, + dst_frame); + cost_wiener_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + + if (cost_wiener_frame < cost_norestore_frame) { + info->frame_restoration_type = RESTORE_WIENER; + } else { + info->frame_restoration_type = RESTORE_NONE; + } + + return info->frame_restoration_type == RESTORE_WIENER ? cost_wiener_frame + : cost_norestore_frame; +} + +static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, + int partial_frame, RestorationInfo *info, + RestorationType *type, double *best_tile_cost, + YV12_BUFFER_CONFIG *dst_frame) { + WienerInfo *wiener_info = info->wiener_info; + AV1_COMMON *const cm = &cpi->common; + RestorationInfo *rsi = cpi->rst_search; + int64_t err; + int bits; + double cost_wiener, cost_norestore; + MACROBLOCK *x = &cpi->td.mb; + double M[WIENER_WIN2]; + double H[WIENER_WIN2 * WIENER_WIN2]; + double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; + const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + const int width = cm->width; + const int height = cm->height; + const int src_stride = src->y_stride; + const int dgd_stride = dgd->y_stride; + double score; + int tile_idx, tile_width, tile_height, nhtiles, nvtiles; + int h_start, h_end, v_start, v_end; + const int ntiles = + av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize, + &tile_width, &tile_height, &nhtiles, &nvtiles); + WienerInfo ref_wiener_info; + set_default_wiener(&ref_wiener_info); + + assert(width == dgd->y_crop_width); + assert(height == dgd->y_crop_height); + assert(width == src->y_crop_width); + assert(height == src->y_crop_height); + + rsi->frame_restoration_type = RESTORE_WIENER; + + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + rsi->restoration_type[tile_idx] = RESTORE_NONE; + } + +// Construct a (WIENER_HALFWIN)-pixel border around the frame +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->y_buffer), width, height, + dgd_stride); + else +#endif + extend_frame(dgd->y_buffer, width, height, dgd_stride); + + // Compute best Wiener filters for each tile + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, width, height, 0, 0, &h_start, &h_end, + &v_start, &v_end); + err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, + h_end - h_start, v_start, v_end - v_start, 1); + // #bits when a tile is not restored + bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); + cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + best_tile_cost[tile_idx] = DBL_MAX; + + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, width, height, 0, 0, &h_start, &h_end, + &v_start, &v_end); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H); + else +#endif // CONFIG_HIGHBITDEPTH + compute_stats(dgd->y_buffer, src->y_buffer, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H); + + type[tile_idx] = RESTORE_WIENER; + + if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) { + type[tile_idx] = RESTORE_NONE; + continue; + } + quantize_sym_filter(vfilterd, rsi->wiener_info[tile_idx].vfilter); + quantize_sym_filter(hfilterd, rsi->wiener_info[tile_idx].hfilter); + + // Filter score computes the value of the function x'*A*x - x'*b for the + // learned filter and compares it against identity filer. If there is no + // reduction in the function, the filter is reverted back to identity + score = compute_score(M, H, rsi->wiener_info[tile_idx].vfilter, + rsi->wiener_info[tile_idx].hfilter); + if (score > 0.0) { + type[tile_idx] = RESTORE_NONE; + continue; + } + + rsi->restoration_type[tile_idx] = RESTORE_WIENER; + err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0, + dst_frame); + bits = count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); + cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + if (cost_wiener >= cost_norestore) { + type[tile_idx] = RESTORE_NONE; + } else { + type[tile_idx] = RESTORE_WIENER; + memcpy(&wiener_info[tile_idx], &rsi->wiener_info[tile_idx], + sizeof(wiener_info[tile_idx])); + memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx], + sizeof(ref_wiener_info)); + bits = count_wiener_bits(&wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + best_tile_cost[tile_idx] = err; + } + rsi->restoration_type[tile_idx] = RESTORE_NONE; + } + // Cost for Wiener filtering + set_default_wiener(&ref_wiener_info); + bits = frame_level_restore_bits[rsi->frame_restoration_type] + << AV1_PROB_COST_SHIFT; + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + bits += + av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); + memcpy(&rsi->wiener_info[tile_idx], &wiener_info[tile_idx], + sizeof(wiener_info[tile_idx])); + if (type[tile_idx] == RESTORE_WIENER) { + bits += count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx], + sizeof(ref_wiener_info)); + } + rsi->restoration_type[tile_idx] = type[tile_idx]; + } + err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame); + cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + + return cost_wiener; +} + +static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, + int partial_frame, RestorationInfo *info, + RestorationType *type, double *best_tile_cost, + YV12_BUFFER_CONFIG *dst_frame) { + double err, cost_norestore; + int bits; + MACROBLOCK *x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + int tile_idx, tile_width, tile_height, nhtiles, nvtiles; + int h_start, h_end, v_start, v_end; + const int ntiles = av1_get_rest_ntiles( + cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width, + &tile_height, &nhtiles, &nvtiles); + (void)info; + (void)dst_frame; + (void)partial_frame; + + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, + tile_height, cm->width, cm->height, 0, 0, &h_start, + &h_end, &v_start, &v_end); + err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, + h_end - h_start, v_start, v_end - v_start, 1); + type[tile_idx] = RESTORE_NONE; + best_tile_cost[tile_idx] = err; + } + // RD cost associated with no restoration + err = sse_restoration_tile(src, cm->frame_to_show, cm, 0, cm->width, 0, + cm->height, 1); + bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT; + cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + return cost_norestore; +} + +static double search_switchable_restoration( + AV1_COMP *cpi, int partial_frame, RestorationInfo *rsi, + double *tile_cost[RESTORE_SWITCHABLE_TYPES]) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *x = &cpi->td.mb; + double cost_switchable = 0; + int bits, tile_idx; + RestorationType r; + const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, + cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); + SgrprojInfo ref_sgrproj_info; + set_default_sgrproj(&ref_sgrproj_info); + WienerInfo ref_wiener_info; + set_default_wiener(&ref_wiener_info); + (void)partial_frame; + + rsi->frame_restoration_type = RESTORE_SWITCHABLE; + bits = frame_level_restore_bits[rsi->frame_restoration_type] + << AV1_PROB_COST_SHIFT; + cost_switchable = RDCOST_DBL(x->rdmult, x->rddiv, bits >> 4, 0); + for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { + double best_cost = RDCOST_DBL( + x->rdmult, x->rddiv, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4), + tile_cost[RESTORE_NONE][tile_idx]); + rsi->restoration_type[tile_idx] = RESTORE_NONE; + for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) { + if (force_restore_type != 0) + if (r != force_restore_type) continue; + int tilebits = 0; + if (r == RESTORE_WIENER) + tilebits += + count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info); + else if (r == RESTORE_SGRPROJ) + tilebits += + count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info); + tilebits <<= AV1_PROB_COST_SHIFT; + tilebits += cpi->switchable_restore_cost[r]; + double cost = RDCOST_DBL(x->rdmult, x->rddiv, tilebits >> 4, + tile_cost[r][tile_idx]); + + if (cost < best_cost) { + rsi->restoration_type[tile_idx] = r; + best_cost = cost; + } + } + if (rsi->restoration_type[tile_idx] == RESTORE_WIENER) + memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx], + sizeof(ref_wiener_info)); + else if (rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) + memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx], + sizeof(ref_sgrproj_info)); + if (force_restore_type != 0) + assert(rsi->restoration_type[tile_idx] == force_restore_type || + rsi->restoration_type[tile_idx] == RESTORE_NONE); + cost_switchable += best_cost; + } + return cost_switchable; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, + }; + AV1_COMMON *const cm = &cpi->common; + double cost_restore[RESTORE_TYPES]; + double *tile_cost[RESTORE_SWITCHABLE_TYPES]; + RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES]; + double best_cost_restore; + RestorationType r, best_restore; + + const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, + cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); + + for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { + tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles); + restore_types[r] = + (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles); + } + + for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + if (force_restore_type != 0) + if (r != RESTORE_NONE && r != force_restore_type) continue; + cost_restore[r] = search_restore_fun[r]( + src, cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0], + restore_types[r], tile_cost[r], &cpi->trial_frame_rst); + } + cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration( + cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0], tile_cost); + + best_cost_restore = DBL_MAX; + best_restore = 0; + for (r = 0; r < RESTORE_TYPES; ++r) { + if (force_restore_type != 0) + if (r != RESTORE_NONE && r != force_restore_type) continue; + if (cost_restore[r] < best_cost_restore) { + best_restore = r; + best_cost_restore = cost_restore[r]; + } + } + cm->rst_info[0].frame_restoration_type = best_restore; + if (force_restore_type != 0) + assert(best_restore == force_restore_type || best_restore == RESTORE_NONE); + if (best_restore != RESTORE_SWITCHABLE) { + memcpy(cm->rst_info[0].restoration_type, restore_types[best_restore], + ntiles * sizeof(restore_types[best_restore][0])); + } + + // Color components + search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_U, + &cm->rst_info[AOM_PLANE_U], + cm->rst_info[AOM_PLANE_U].restoration_type, + &cpi->trial_frame_rst); + search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_V, + &cm->rst_info[AOM_PLANE_V], + cm->rst_info[AOM_PLANE_V].restoration_type, + &cpi->trial_frame_rst); + /* + printf("Frame %d/%d restore types: %d %d %d\n", + cm->current_video_frame, cm->show_frame, + cm->rst_info[0].frame_restoration_type, + cm->rst_info[1].frame_restoration_type, + cm->rst_info[2].frame_restoration_type); + printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n", + cm->current_video_frame, cm->show_frame, + cm->rst_info[0].frame_restoration_type, cost_restore[0], + cost_restore[1], cost_restore[2], cost_restore[3]); + */ + + for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { + aom_free(tile_cost[r]); + aom_free(restore_types[r]); + } +} diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h new file mode 100644 index 000000000..f6096ed1d --- /dev/null +++ b/third_party/aom/av1/encoder/pickrst.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AV1_ENCODER_PICKRST_H_ +#define AV1_ENCODER_PICKRST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_PICKRST_H_ diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c new file mode 100644 index 000000000..ab63f1b7d --- /dev/null +++ b/third_party/aom/av1/encoder/pvq_encoder.c @@ -0,0 +1,988 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include "aom_dsp/entcode.h" +#include "aom_dsp/entenc.h" +#include "av1/common/blockd.h" +#include "av1/common/odintrin.h" +#include "av1/common/partition.h" +#include "av1/common/pvq_state.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/pvq_encoder.h" +#include "aom_ports/system_state.h" + +/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the + dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/ +#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0) + +void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + if (cdf[0] == 0) + aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs)); + aom_write_symbol(w, symb, cdf, nsymbs); +} + +static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt, + const od_coeff *in, int n, int k) { + int i; + aom_encode_band_pvq_splits(w, adapt, in, n, k, 0); + for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0); +} + +/* Computes 1/sqrt(i) using a table for small values. */ +static double od_rsqrt_table(int i) { + static double table[16] = { + 1.000000, 0.707107, 0.577350, 0.500000, + 0.447214, 0.408248, 0.377964, 0.353553, + 0.333333, 0.316228, 0.301511, 0.288675, + 0.277350, 0.267261, 0.258199, 0.250000}; + if (i <= 16) return table[i-1]; + else return 1./sqrt(i); +} + +/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results + where 0 <= i < table_size.*/ +static double od_custom_rsqrt_dynamic_table(const double* table, + const int table_size, const double start, const int i) { + if (i < table_size) return table[i]; + else return od_rsqrt_table((int)(start + 2*i + 1)); +} + +/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/ +static void od_fill_dynamic_rsqrt_table(double *table, const int table_size, + const double start) { + int i; + for (i = 0; i < table_size; i++) + table[i] = od_rsqrt_table((int)(start + 2*i + 1)); +} + +/** Find the codepoint on the given PSphere closest to the desired + * vector. Double-precision PVQ search just to make sure our tests + * aren't limited by numerical accuracy. + * + * @param [in] xcoeff input vector to quantize (x in the math doc) + * @param [in] n number of dimensions + * @param [in] k number of pulses + * @param [out] ypulse optimal codevector found (y in the math doc) + * @param [out] g2 multiplier for the distortion (typically squared + * gain units) + * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO + * @param [in] prev_k number of pulses already in ypulse that we should + * reuse for the search (or 0 for a new search) + * @return cosine distance between x and y (between 0 and 1) + */ +double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k, + od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) { + int i, j; + double xy; + double yy; + /* TODO - This blows our 8kB stack space budget and should be fixed when + converting PVQ to fixed point. */ + double x[MAXN]; + double xx; + double lambda; + double norm_1; + int rdo_pulses; + double delta_rate; + xx = xy = yy = 0; + for (j = 0; j < n; j++) { + x[j] = fabs((float)xcoeff[j]); + xx += x[j]*x[j]; + } + norm_1 = 1./sqrt(1e-30 + xx); + lambda = pvq_norm_lambda/(1e-30 + g2); + i = 0; + if (prev_k > 0 && prev_k <= k) { + /* We reuse pulses from a previous search so we don't have to search them + again. */ + for (j = 0; j < n; j++) { + ypulse[j] = abs(ypulse[j]); + xy += x[j]*ypulse[j]; + yy += ypulse[j]*ypulse[j]; + i += ypulse[j]; + } + } + else if (k > 2) { + double l1_norm; + double l1_inv; + l1_norm = 0; + for (j = 0; j < n; j++) l1_norm += x[j]; + l1_inv = 1./OD_MAXF(l1_norm, 1e-100); + for (j = 0; j < n; j++) { + double tmp; + tmp = k*x[j]*l1_inv; + ypulse[j] = OD_MAXI(0, (int)floor(tmp)); + xy += x[j]*ypulse[j]; + yy += ypulse[j]*ypulse[j]; + i += ypulse[j]; + } + } + else OD_CLEAR(ypulse, n); + + /* Only use RDO on the last few pulses. This not only saves CPU, but using + RDO on all pulses actually makes the results worse for reasons I don't + fully understand. */ + rdo_pulses = 1 + k/4; + /* Rough assumption for now, the last position costs about 3 bits more than + the first. */ + delta_rate = 3./n; + /* Search one pulse at a time */ + for (; i < k - rdo_pulses; i++) { + int pos; + double best_xy; + double best_yy; + pos = 0; + best_xy = -10; + best_yy = 1; + for (j = 0; j < n; j++) { + double tmp_xy; + double tmp_yy; + tmp_xy = xy + x[j]; + tmp_yy = yy + 2*ypulse[j] + 1; + tmp_xy *= tmp_xy; + if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { + best_xy = tmp_xy; + best_yy = tmp_yy; + pos = j; + } + } + xy = xy + x[pos]; + yy = yy + 2*ypulse[pos] + 1; + ypulse[pos]++; + } + /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2 + and since x^2 and y^2 are constant, we just maximize x*y, plus a + lambda*rate term. Note that since x and y aren't normalized here, + we need to divide by sqrt(x^2)*sqrt(y^2). */ + for (; i < k; i++) { + double rsqrt_table[4]; + int rsqrt_table_size = 4; + int pos; + double best_cost; + pos = 0; + best_cost = -1e5; + /*Fill the small rsqrt lookup table with inputs relative to yy. + Specifically, the table of n values is filled with + rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/ + od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy); + for (j = 0; j < n; j++) { + double tmp_xy; + double tmp_yy; + tmp_xy = xy + x[j]; + /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/ + tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size, + yy, ypulse[j]); + tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate; + if (j == 0 || tmp_xy > best_cost) { + best_cost = tmp_xy; + pos = j; + } + } + xy = xy + x[pos]; + yy = yy + 2*ypulse[pos] + 1; + ypulse[pos]++; + } + for (i = 0; i < n; i++) { + if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; + } + return xy/(1e-100 + sqrt(xx*yy)); +} + +/** Encodes the gain so that the return value increases with the + * distance |x-ref|, so that we can encode a zero when x=ref. The + * value x=0 is not covered because it is only allowed in the noref + * case. + * + * @param [in] x quantized gain to encode + * @param [in] ref quantized gain of the reference + * @return interleave-encoded quantized gain value + */ +static int neg_interleave(int x, int ref) { + if (x < ref) return -2*(x - ref) - 1; + else if (x < 2*ref) return 2*(x - ref); + else return x-1; +} + +int od_vector_is_null(const od_coeff *x, int len) { + int i; + for (i = 0; i < len; i++) if (x[i]) return 0; + return 1; +} + +static double od_pvq_rate(int qg, int icgr, int theta, int ts, + const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) { + double rate; + if (k == 0) rate = 0; + else if (speed > 0) { + int i; + int sum; + double f; + /* Compute "center of mass" of the pulse vector. */ + sum = 0; + for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]); + f = sum/(double)(k*n); + /* Estimates the number of bits it will cost to encode K pulses in + N dimensions based on hand-tuned fit for bitrate vs K, N and + "center of mass". */ + rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3; + } + else { + aom_writer w; + od_pvq_codeword_ctx cd; + int tell; +#if CONFIG_DAALA_EC + od_ec_enc_init(&w.ec, 1000); +#else +# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1); +#if CONFIG_DAALA_EC + tell = od_ec_enc_tell_frac(&w.ec); +#else +# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k); +#if CONFIG_DAALA_EC + rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.; + od_ec_enc_clear(&w.ec); +#else +# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + } + if (qg > 0 && theta >= 0) { + /* Approximate cost of entropy-coding theta */ + rate += .9*OD_LOG2(ts); + if (qg == icgr) rate -= .5; + } + return rate; +} + +#define MAX_PVQ_ITEMS (20) +/* This stores the information about a PVQ search candidate, so we can sort + based on K. */ +typedef struct { + int gain; + int k; + od_val32 qtheta; + int theta; + int ts; + od_val32 qcg; +} pvq_search_item; + +int items_compare(pvq_search_item *a, pvq_search_item *b) { + /* Break ties in K with gain to ensure a stable sort. + Otherwise, the order depends on qsort implementation. */ + return a->k == b->k ? a->gain - b->gain : a->k - b->k; +} + +/** Perform PVQ quantization with prediction, trying several + * possible gains and angles. See draft-valin-videocodec-pvq and + * http://jmvalin.ca/slides/pvq.pdf for more details. + * + * @param [out] out coefficients after quantization + * @param [in] x0 coefficients before quantization + * @param [in] r0 reference, aka predicted coefficients + * @param [in] n number of dimensions + * @param [in] q0 quantization step size + * @param [out] y pulse vector (i.e. selected PVQ codevector) + * @param [out] itheta angle between input and reference (-1 if noref) + * @param [out] vk total number of pulses + * @param [in] beta per-band activity masking beta param + * @param [out] skip_diff distortion cost of skipping this block + * (accumulated) + * @param [in] is_keyframe whether we're encoding a keyframe + * @param [in] pli plane index + * @param [in] adapt probability adaptation context + * @param [in] qm QM with magnitude compensation + * @param [in] qm_inv Inverse of QM with magnitude compensation + * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO + * @param [in] speed Make search faster by making approximations + * @return gain index of the quatized gain +*/ +static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0, + int n, int q0, od_coeff *y, int *itheta, int *vk, + od_val16 beta, double *skip_diff, int is_keyframe, int pli, + const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv, + double pvq_norm_lambda, int speed) { + od_val32 g; + od_val32 gr; + od_coeff y_tmp[MAXN + 3]; + int i; + /* Number of pulses. */ + int k; + /* Companded gain of x and reference, normalized to q. */ + od_val32 cg; + od_val32 cgr; + int icgr; + int qg; + /* Best RDO cost (D + lamdba*R) so far. */ + double best_cost; + double dist0; + /* Distortion (D) that corresponds to the best RDO cost. */ + double best_dist; + double dist; + /* Sign of Householder reflection. */ + int s; + /* Dimension on which Householder reflects. */ + int m; + od_val32 theta; + double corr; + int best_k; + od_val32 best_qtheta; + od_val32 gain_offset; + int noref; + double skip_dist; + int cfl_enabled; + int skip; + double gain_weight; + od_val16 x16[MAXN]; + od_val16 r16[MAXN]; + int xshift; + int rshift; + /* Give more weight to gain error when calculating the total distortion. */ + gain_weight = 1.0; + OD_ASSERT(n > 1); + corr = 0; +#if !defined(OD_FLOAT_PVQ) + /* Shift needed to make x fit in 16 bits even after rotation. + This shift value is not normative (it can be changed without breaking + the bitstream) */ + xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15); + /* Shift needed to make the reference fit in 15 bits, so that the Householder + vector can fit in 16 bits. + This shift value *is* normative, and has to match the decoder. */ + rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14); +#else + xshift = 0; + rshift = 0; +#endif + for (i = 0; i < n; i++) { +#if defined(OD_FLOAT_PVQ) + /*This is slightly different from the original float PVQ code, + where the qm was applied in the accumulation in od_pvq_compute_gain and + the vectors were od_coeffs, not od_val16 (i.e. double).*/ + x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1; + r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1; +#else + x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift); + r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift); +#endif + corr += OD_MULT16_16(x16[i], r16[i]); + } + cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL; + cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift); + cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift); + if (cfl_enabled) cgr = OD_CGAIN_SCALE; + /* gain_offset is meant to make sure one of the quantized gains has + exactly the same gain as the reference. */ +#if defined(OD_FLOAT_PVQ) + icgr = (int)floor(.5 + cgr); +#else + icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT); +#endif + gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT); + /* Start search with null case: gain=0, no pulse. */ + qg = 0; + dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; + best_dist = dist; + best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0, + n, speed); + noref = 1; + best_k = 0; + *itheta = -1; + OD_CLEAR(y, n); + best_qtheta = 0; + m = 0; + s = 1; + corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift)); + corr = OD_MAXF(OD_MINF(corr, 1.), -1.); + if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; + else { + skip_dist = gain_weight*(cg - cgr)*(cg - cgr) + + cgr*(double)cg*(2 - 2*corr); + skip_dist *= OD_CGAIN_SCALE_2; + } + if (!is_keyframe) { + /* noref, gain=0 isn't allowed, but skip is allowed. */ + od_val32 scgr; + scgr = OD_MAXF(0,gain_offset); + if (icgr == 0) { + best_dist = gain_weight*(cg - scgr)*(cg - scgr) + + scgr*(double)cg*(2 - 2*corr); + best_dist *= OD_CGAIN_SCALE_2; + } + best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt, + NULL, 0, n, speed); + best_qtheta = 0; + *itheta = 0; + noref = 0; + } + dist0 = best_dist; + if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) { + od_val16 xr[MAXN]; + int gain_bound; + int prev_k; + pvq_search_item items[MAX_PVQ_ITEMS]; + int idx; + int nitems; + double cos_dist; + idx = 0; + gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT); + /* Perform theta search only if prediction is useful. */ + theta = OD_ROUND32(OD_THETA_SCALE*acos(corr)); + m = od_compute_householder(r16, n, gr, &s, rshift); + od_apply_householder(xr, x16, r16, n); + prev_k = 0; + for (i = m; i < n - 1; i++) xr[i] = xr[i + 1]; + /* Compute all candidate PVQ searches within a reasonable range of gain + and theta. */ + for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) { + int j; + od_val32 qcg; + int ts; + int theta_lower; + int theta_upper; + /* Quantized companded gain */ + qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset; + /* Set angular resolution (in ra) to match the encoded gain */ + ts = od_pvq_compute_max_theta(qcg, beta); + theta_lower = OD_MAXI(0, (int)floor(.5 + + theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2); + theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts)); + /* Include the angles within a reasonable range. */ + for (j = theta_lower; j <= theta_upper; j++) { + od_val32 qtheta; + qtheta = od_pvq_compute_theta(j, ts); + k = od_pvq_compute_k(qcg, j, 0, n, beta); + items[idx].gain = i; + items[idx].theta = j; + items[idx].k = k; + items[idx].qcg = qcg; + items[idx].qtheta = qtheta; + items[idx].ts = ts; + idx++; + OD_ASSERT(idx < MAX_PVQ_ITEMS); + } + } + nitems = idx; + cos_dist = 0; + /* Sort PVQ search candidates in ascending order of pulses K so that + we can reuse all the previously searched pulses across searches. */ + qsort(items, nitems, sizeof(items[0]), + (int (*)(const void *, const void *))items_compare); + /* Search for the best gain/theta in order. */ + for (idx = 0; idx < nitems; idx++) { + int j; + od_val32 qcg; + int ts; + double cost; + double dist_theta; + double sin_prod; + od_val32 qtheta; + /* Quantized companded gain */ + qcg = items[idx].qcg; + i = items[idx].gain; + j = items[idx].theta; + /* Set angular resolution (in ra) to match the encoded gain */ + ts = items[idx].ts; + /* Search for the best angle within a reasonable range. */ + qtheta = items[idx].qtheta; + k = items[idx].k; + /* Compute the minimal possible distortion by not taking the PVQ + cos_dist into account. */ + dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1; + dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; + dist *= OD_CGAIN_SCALE_2; + /* If we have no hope of beating skip (including a 1-bit worst-case + penalty), stop now. */ + if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue; + sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)* + OD_TRIG_SCALE_1; + /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since + that's the factor by which cos_dist is multiplied to get the + distortion metric. */ + if (k == 0) { + cos_dist = 0; + OD_CLEAR(y_tmp, n-1); + } + else if (k != prev_k) { + cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp, + qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); + } + prev_k = k; + /* See Jmspeex' Journal of Dubious Theoretical Results. */ + dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1 + + sin_prod*(2 - 2*cos_dist); + dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; + dist *= OD_CGAIN_SCALE_2; + /* Do approximate RDO. */ + cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp, + k, n, speed); + if (cost < best_cost) { + best_cost = cost; + best_dist = dist; + qg = i; + best_k = k; + best_qtheta = qtheta; + *itheta = j; + noref = 0; + OD_COPY(y, y_tmp, n - 1); + } + } + } + /* Don't bother with no-reference version if there's a reasonable + correlation. */ + if (n <= OD_MAX_PVQ_SIZE && (corr < .5 + || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) { + int gain_bound; + int prev_k; + gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT); + prev_k = 0; + /* Search for the best gain (haven't determined reasonable range yet). */ + for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) { + double cos_dist; + double cost; + od_val32 qcg; + qcg = OD_SHL(i, OD_CGAIN_SHIFT); + k = od_pvq_compute_k(qcg, -1, 1, n, beta); + /* Compute the minimal possible distortion by not taking the PVQ + cos_dist into account. */ + dist = gain_weight*(qcg - cg)*(qcg - cg); + dist *= OD_CGAIN_SCALE_2; + if (dist > dist0 && k != 0) continue; + cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp, + qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k); + prev_k = k; + /* See Jmspeex' Journal of Dubious Theoretical Results. */ + dist = gain_weight*(qcg - cg)*(qcg - cg) + + qcg*(double)cg*(2 - 2*cos_dist); + dist *= OD_CGAIN_SCALE_2; + /* Do approximate RDO. */ + cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k, + n, speed); + if (cost <= best_cost) { + best_cost = cost; + best_dist = dist; + qg = i; + noref = 1; + best_k = k; + *itheta = -1; + OD_COPY(y, y_tmp, n); + } + } + } + k = best_k; + theta = best_qtheta; + skip = 0; + if (noref) { + if (qg == 0) skip = OD_PVQ_SKIP_ZERO; + } + else { + if (!is_keyframe && qg == 0) { + skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY); + } + if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY; + } + /* Synthesize like the decoder would. */ + if (skip) { + if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n); + else OD_CLEAR(out, n); + } + else { + if (noref) gain_offset = 0; + g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta); + od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s, + qm_inv); + } + *vk = k; + *skip_diff += skip_dist - best_dist; + /* Encode gain differently depending on whether we use prediction or not. + Special encoding on inter frames where qg=0 is allowed for noref=0 + but not noref=1.*/ + if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr); + else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1); +} + +/** Encodes a single vector of integers (eg, a partition within a + * coefficient block) using PVQ + * + * @param [in,out] w multi-symbol entropy encoder + * @param [in] qg quantized gain + * @param [in] theta quantized post-prediction theta + * @param [in] in coefficient vector to code + * @param [in] n number of coefficients in partition + * @param [in] k number of pulses in partition + * @param [in,out] model entropy encoder state + * @param [in,out] adapt adaptation context + * @param [in,out] exg ExQ16 expectation of gain value + * @param [in,out] ext ExQ16 expectation of theta value + * @param [in] cdf_ctx selects which cdf context to use + * @param [in] is_keyframe whether we're encoding a keyframe + * @param [in] code_skip whether the "skip rest" flag is allowed + * @param [in] skip_rest when set, we skip all higher bands + * @param [in] encode_flip whether we need to encode the CfL flip flag now + * @param [in] flip value of the CfL flip flag + */ +void pvq_encode_partition(aom_writer *w, + int qg, + int theta, + const od_coeff *in, + int n, + int k, + generic_encoder model[3], + od_adapt_ctx *adapt, + int *exg, + int *ext, + int cdf_ctx, + int is_keyframe, + int code_skip, + int skip_rest, + int encode_flip, + int flip) { + int noref; + int id; + noref = (theta == -1); + id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest; + if (is_keyframe) { + OD_ASSERT(id != 8); + if (id >= 8) id--; + } + else { + OD_ASSERT(id != 10); + if (id >= 10) id--; + } + /* Jointly code gain, theta and noref for small values. Then we handle + larger gain and theta values. For noref, theta = -1. */ + aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0], + 8 + 7*code_skip); + if (encode_flip) { + /* We could eventually do some smarter entropy coding here, but it would + have to be good enough to overcome the overhead of the entropy coder. + An early attempt using a "toogle" flag with simple adaptation wasn't + worth the trouble. */ + aom_write_bit(w, flip); + } + if (qg > 0) { + int tmp; + tmp = *exg; + generic_encode(w, &model[!noref], qg - 1, &tmp, 2); + OD_IIR_DIADIC(*exg, qg << 16, 2); + } + if (theta > 1) { + int tmp; + tmp = *ext; + generic_encode(w, &model[2], theta - 2, &tmp, 2); + OD_IIR_DIADIC(*ext, theta << 16, 2); + } + aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in, + n - (theta != -1), k); +} + +/** Quantizes a scalar with rate-distortion optimization (RDO) + * @param [in] x unquantized value + * @param [in] q quantization step size + * @param [in] delta0 rate increase for encoding a 1 instead of a 0 + * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO + * @retval quantized value + */ +int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) { + int n; + /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See + Jmspeex' Journal of Dubious Theoretical Results for details. */ + n = OD_DIV_R0(abs(x), q); + if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) { + return 0; + } + else { + return OD_DIV_R0(x, q); + } +} + +/** Encode a coefficient block (excepting DC) using PVQ + * + * @param [in,out] enc daala encoder context + * @param [in] ref 'reference' (prediction) vector + * @param [in] in coefficient block to quantize and encode + * @param [out] out quantized coefficient block + * @param [in] q0 scale/quantizer + * @param [in] pli plane index + * @param [in] bs log of the block size minus two + * @param [in] beta per-band activity masking beta param + * @param [in] is_keyframe whether we're encoding a keyframe + * @param [in] qm QM with magnitude compensation + * @param [in] qm_inv Inverse of QM with magnitude compensation + * @param [in] speed Make search faster by making approximations + * @param [in] pvq_info If null, conisdered as RDO search mode + * @return Returns block skip info indicating whether DC/AC are coded. + * bit0: DC is coded, bit1: AC is coded (1 means coded) + * + */ +PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, + od_coeff *ref, + const od_coeff *in, + od_coeff *out, + int q_dc, + int q_ac, + int pli, + int bs, + const od_val16 *beta, + int is_keyframe, + const int16_t *qm, + const int16_t *qm_inv, + int speed, + PVQ_INFO *pvq_info){ + int theta[PVQ_MAX_PARTITIONS]; + int qg[PVQ_MAX_PARTITIONS]; + int k[PVQ_MAX_PARTITIONS]; + od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX]; + int *exg; + int *ext; + int nb_bands; + int i; + const int *off; + int size[PVQ_MAX_PARTITIONS]; + generic_encoder *model; + double skip_diff; + int tell; + uint16_t *skip_cdf; + od_rollback_buffer buf; + int dc_quant; + int flip; + int cfl_encoded; + int skip_rest; + int skip_dir; + int skip_theta_value; + const unsigned char *pvq_qm; + double dc_rate; + int use_masking; + PVQ_SKIP_TYPE ac_dc_coded; + + aom_clear_system_state(); + + use_masking = enc->use_activity_masking; + + if (use_masking) + pvq_qm = &enc->state.pvq_qm_q4[pli][0]; + else + pvq_qm = 0; + + exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0]; + ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS; + skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)]; + model = enc->state.adapt->pvq.pvq_param_model; + nb_bands = OD_BAND_OFFSETS[bs][0]; + off = &OD_BAND_OFFSETS[bs][1]; + + if (use_masking) + dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4); + else + dc_quant = OD_MAXI(1, q_dc); + + tell = 0; + for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i]; + skip_diff = 0; + flip = 0; + /*If we are coding a chroma block of a keyframe, we are doing CfL.*/ + if (pli != 0 && is_keyframe) { + od_val32 xy; + xy = 0; + /*Compute the dot-product of the first band of chroma with the luma ref.*/ + for (i = off[0]; i < off[1]; i++) { +#if defined(OD_FLOAT_PVQ) + xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1* + (double)in[i]*(double)qm[i]*OD_QM_SCALE_1; +#else + od_val32 rq; + od_val32 inq; + rq = ref[i]*qm[i]; + inq = in[i]*qm[i]; + xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT, + 1)); +#endif + } + /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/ + if (xy < 0) { + flip = 1; + for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i]; + } + } + for (i = 0; i < nb_bands; i++) { + int q; + + if (use_masking) + q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4); + else + q = OD_MAXI(1, q_ac); + + qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i], + q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe, + pli, enc->state.adapt, qm + off[i], qm_inv + off[i], + enc->pvq_norm_lambda, speed); + } + od_encode_checkpoint(enc, &buf); + if (is_keyframe) out[0] = 0; + else { + int n; + n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); + if (n == 0) { + out[0] = 0; + } else { + int tell2; + od_rollback_buffer dc_buf; + + dc_rate = -OD_LOG2((double)(skip_cdf[3] - skip_cdf[2])/ + (double)(skip_cdf[2] - skip_cdf[1])); + dc_rate += 1; + +#if CONFIG_DAALA_EC + tell2 = od_ec_enc_tell_frac(&enc->w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + od_encode_checkpoint(enc, &dc_buf); + generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], + n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); +#if CONFIG_DAALA_EC + tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + dc_rate += tell2/8.0; + od_encode_rollback(enc, &dc_buf); + + out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, + enc->pvq_norm_lambda); + } + } +#if CONFIG_DAALA_EC + tell = od_ec_enc_tell_frac(&enc->w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + /* Code as if we're not skipping. */ + aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4); + ac_dc_coded = AC_CODED + (out[0] != 0); + cfl_encoded = 0; + skip_rest = 1; + skip_theta_value = is_keyframe ? -1 : 0; + for (i = 1; i < nb_bands; i++) { + if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0; + } + skip_dir = 0; + if (nb_bands > 1) { + for (i = 0; i < 3; i++) { + int j; + int tmp; + tmp = 1; + // ToDo(yaowu): figure out better stop condition without gcc warning. + for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) { + if (theta[j] != skip_theta_value || qg[j]) tmp = 0; + } + skip_dir |= tmp << i; + } + } + if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0; + + /* NOTE: There was no other better place to put this function. */ + if (pvq_info) + av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size, + skip_rest, skip_dir, bs); + + for (i = 0; i < nb_bands; i++) { + int encode_flip; + /* Encode CFL flip bit just after the first time it's used. */ + encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded; + if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) { + pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i], + size[i], k[i], model, enc->state.adapt, exg + i, ext + i, + (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i, + is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip); + } + if (i == 0 && !skip_rest && bs > 0) { + aom_write_symbol(&enc->w, skip_dir, + &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7); + } + if (encode_flip) cfl_encoded = 1; + } +#if CONFIG_DAALA_EC + tell = od_ec_enc_tell_frac(&enc->w.ec) - tell; +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + /* Account for the rate of skipping the AC, based on the same DC decision + we made when trying to not skip AC. */ + { + double skip_rate; + if (out[0] != 0) { + skip_rate = -OD_LOG2((skip_cdf[1] - skip_cdf[0])/ + (double)skip_cdf[3]); + } + else { + skip_rate = -OD_LOG2(skip_cdf[0]/ + (double)skip_cdf[3]); + } + tell -= (int)floor(.5+8*skip_rate); + } + if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) { + if (is_keyframe) out[0] = 0; + else { + int n; + n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant); + if (n == 0) { + out[0] = 0; + } else { + int tell2; + od_rollback_buffer dc_buf; + + dc_rate = -OD_LOG2((double)(skip_cdf[1] - skip_cdf[0])/ + (double)skip_cdf[0]); + dc_rate += 1; + +#if CONFIG_DAALA_EC + tell2 = od_ec_enc_tell_frac(&enc->w.ec); +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + od_encode_checkpoint(enc, &dc_buf); + generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], + n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); +#if CONFIG_DAALA_EC + tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; +#else +#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#endif + dc_rate += tell2/8.0; + od_encode_rollback(enc, &dc_buf); + + out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate, + enc->pvq_norm_lambda); + } + } + /* We decide to skip, roll back everything as it was before. */ + od_encode_rollback(enc, &buf); + aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4); + ac_dc_coded = (out[0] != 0); + if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0; + else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i]; + } + if (pvq_info) + pvq_info->ac_dc_coded = ac_dc_coded; + return ac_dc_coded; +} diff --git a/third_party/aom/av1/encoder/pvq_encoder.h b/third_party/aom/av1/encoder/pvq_encoder.h new file mode 100644 index 000000000..b84c8961b --- /dev/null +++ b/third_party/aom/av1/encoder/pvq_encoder.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#if !defined(_pvq_encoder_H) +# define _pvq_encoder_H (1) +# include "aom_dsp/bitwriter.h" +# include "aom_dsp/entenc.h" +# include "av1/common/blockd.h" +# include "av1/common/pvq.h" +# include "av1/encoder/encint.h" + +void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs); + +void aom_encode_band_pvq_splits(aom_writer *w, od_pvq_codeword_ctx *adapt, + const int *y, int n, int k, int level); + +void aom_laplace_encode_special(aom_writer *w, int x, unsigned decay); + +void pvq_encode_partition(aom_writer *w, + int qg, + int theta, + const od_coeff *in, + int n, + int k, + generic_encoder model[3], + od_adapt_ctx *adapt, + int *exg, + int *ext, + int cdf_ctx, + int is_keyframe, + int code_skip, + int skip_rest, + int encode_flip, + int flip); + +PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, od_coeff *ref, + const od_coeff *in, od_coeff *out, int q_dc, int q_ac, int pli, int bs, + const od_val16 *beta, int is_keyframe, + const int16_t *qm, const int16_t *qm_inv, int speed, + PVQ_INFO *pvq_info); + +#endif diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c new file mode 100644 index 000000000..5d5dd7572 --- /dev/null +++ b/third_party/aom/av1/encoder/ransac.c @@ -0,0 +1,1210 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#define _POSIX_C_SOURCE 200112L // rand_r() +#include +#include +#include +#include +#include +#include + +#include "av1/encoder/ransac.h" + +#define MAX_MINPTS 4 +#define MAX_DEGENERATE_ITER 10 +#define MINPTS_MULTIPLIER 5 + +#define INLIER_THRESHOLD 1.0 +#define MIN_TRIALS 20 + +//////////////////////////////////////////////////////////////////////////////// +// ransac +typedef int (*IsDegenerateFunc)(double *p); +typedef void (*NormalizeFunc)(double *p, int np, double *T); +typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2); +typedef int (*FindTransformationFunc)(int points, double *points1, + double *points2, double *params); +typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj); + +static void project_points_double_translation(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = x + mat[0]; + *(proj++) = y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_rotzoom(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = -mat[3] * x + mat[2] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_affine(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = mat[4] * x + mat[5] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_hortrapezoid(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + double x, y, Z, Z_inv; + for (i = 0; i < n; ++i) { + x = *(points++), y = *(points++); + Z_inv = mat[7] * y + 1; + assert(fabs(Z_inv) > 0.000001); + Z = 1. / Z_inv; + *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; + *(proj++) = (mat[5] * y + mat[1]) * Z; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_vertrapezoid(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + double x, y, Z, Z_inv; + for (i = 0; i < n; ++i) { + x = *(points++), y = *(points++); + Z_inv = mat[6] * x + 1; + assert(fabs(Z_inv) > 0.000001); + Z = 1. / Z_inv; + *(proj++) = (mat[2] * x + mat[0]) * Z; + *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_homography(double *mat, double *points, + double *proj, const int n, + const int stride_points, + const int stride_proj) { + int i; + double x, y, Z, Z_inv; + for (i = 0; i < n; ++i) { + x = *(points++), y = *(points++); + Z_inv = mat[6] * x + mat[7] * y + 1; + assert(fabs(Z_inv) > 0.000001); + Z = 1. / Z_inv; + *(proj++) = (mat[2] * x + mat[3] * y + mat[0]) * Z; + *(proj++) = (mat[4] * x + mat[5] * y + mat[1]) * Z; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// svdcmp +// Adopted from Numerical Recipes in C + +static const double TINY_NEAR_ZERO = 1.0E-12; + +static INLINE double sign(double a, double b) { + return ((b) >= 0 ? fabs(a) : -fabs(a)); +} + +static INLINE double pythag(double a, double b) { + double ct; + const double absa = fabs(a); + const double absb = fabs(b); + + if (absa > absb) { + ct = absb / absa; + return absa * sqrt(1.0 + ct * ct); + } else { + ct = absa / absb; + return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct); + } +} + +static void multiply_mat(const double *m1, const double *m2, double *res, + const int m1_rows, const int inner_dim, + const int m2_cols) { + double sum; + + int row, col, inner; + for (row = 0; row < m1_rows; ++row) { + for (col = 0; col < m2_cols; ++col) { + sum = 0; + for (inner = 0; inner < inner_dim; ++inner) + sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col]; + *(res++) = sum; + } + } +} + +static int svdcmp(double **u, int m, int n, double w[], double **v) { + const int max_its = 30; + int flag, i, its, j, jj, k, l, nm; + double anorm, c, f, g, h, s, scale, x, y, z; + double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1)); + g = scale = anorm = 0.0; + for (i = 0; i < n; i++) { + l = i + 1; + rv1[i] = scale * g; + g = s = scale = 0.0; + if (i < m) { + for (k = i; k < m; k++) scale += fabs(u[k][i]); + if (scale != 0.) { + for (k = i; k < m; k++) { + u[k][i] /= scale; + s += u[k][i] * u[k][i]; + } + f = u[i][i]; + g = -sign(sqrt(s), f); + h = f * g - s; + u[i][i] = f - g; + for (j = l; j < n; j++) { + for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j]; + f = s / h; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (k = i; k < m; k++) u[k][i] *= scale; + } + } + w[i] = scale * g; + g = s = scale = 0.0; + if (i < m && i != n - 1) { + for (k = l; k < n; k++) scale += fabs(u[i][k]); + if (scale != 0.) { + for (k = l; k < n; k++) { + u[i][k] /= scale; + s += u[i][k] * u[i][k]; + } + f = u[i][l]; + g = -sign(sqrt(s), f); + h = f * g - s; + u[i][l] = f - g; + for (k = l; k < n; k++) rv1[k] = u[i][k] / h; + for (j = l; j < m; j++) { + for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k]; + for (k = l; k < n; k++) u[j][k] += s * rv1[k]; + } + for (k = l; k < n; k++) u[i][k] *= scale; + } + } + anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i]))); + } + + for (i = n - 1; i >= 0; i--) { + if (i < n - 1) { + if (g != 0.) { + for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j]; + for (k = l; k < n; k++) v[k][j] += s * v[k][i]; + } + } + for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0; + } + v[i][i] = 1.0; + g = rv1[i]; + l = i; + } + for (i = AOMMIN(m, n) - 1; i >= 0; i--) { + l = i + 1; + g = w[i]; + for (j = l; j < n; j++) u[i][j] = 0.0; + if (g != 0.) { + g = 1.0 / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j]; + f = (s / u[i][i]) * g; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (j = i; j < m; j++) u[j][i] *= g; + } else { + for (j = i; j < m; j++) u[j][i] = 0.0; + } + ++u[i][i]; + } + for (k = n - 1; k >= 0; k--) { + for (its = 0; its < max_its; its++) { + flag = 1; + for (l = k; l >= 0; l--) { + nm = l - 1; + if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) { + flag = 0; + break; + } + if ((double)(fabs(w[nm]) + anorm) == anorm) break; + } + if (flag) { + c = 0.0; + s = 1.0; + for (i = l; i <= k; i++) { + f = s * rv1[i]; + rv1[i] = c * rv1[i]; + if ((double)(fabs(f) + anorm) == anorm) break; + g = w[i]; + h = pythag(f, g); + w[i] = h; + h = 1.0 / h; + c = g * h; + s = -f * h; + for (j = 0; j < m; j++) { + y = u[j][nm]; + z = u[j][i]; + u[j][nm] = y * c + z * s; + u[j][i] = z * c - y * s; + } + } + } + z = w[k]; + if (l == k) { + if (z < 0.0) { + w[k] = -z; + for (j = 0; j < n; j++) v[j][k] = -v[j][k]; + } + break; + } + if (its == max_its - 1) { + aom_free(rv1); + return 1; + } + assert(k > 0); + x = w[l]; + nm = k - 1; + y = w[nm]; + g = rv1[nm]; + h = rv1[k]; + f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + g = pythag(f, 1.0); + f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x; + c = s = 1.0; + for (j = l; j <= nm; j++) { + i = j + 1; + g = rv1[i]; + y = w[i]; + h = s * g; + g = c * g; + z = pythag(f, h); + rv1[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y *= c; + for (jj = 0; jj < n; jj++) { + x = v[jj][j]; + z = v[jj][i]; + v[jj][j] = x * c + z * s; + v[jj][i] = z * c - x * s; + } + z = pythag(f, h); + w[j] = z; + if (z != 0.) { + z = 1.0 / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for (jj = 0; jj < m; jj++) { + y = u[jj][j]; + z = u[jj][i]; + u[jj][j] = y * c + z * s; + u[jj][i] = z * c - y * s; + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = x; + } + } + aom_free(rv1); + return 0; +} + +static int SVD(double *U, double *W, double *V, double *matx, int M, int N) { + // Assumes allocation for U is MxN + double **nrU = (double **)aom_malloc((M) * sizeof(*nrU)); + double **nrV = (double **)aom_malloc((N) * sizeof(*nrV)); + int problem, i; + + problem = !(nrU && nrV); + if (!problem) { + for (i = 0; i < M; i++) { + nrU[i] = &U[i * N]; + } + for (i = 0; i < N; i++) { + nrV[i] = &V[i * N]; + } + } else { + if (nrU) aom_free(nrU); + if (nrV) aom_free(nrV); + return 1; + } + + /* copy from given matx into nrU */ + for (i = 0; i < M; i++) { + memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx)); + } + + /* HERE IT IS: do SVD */ + if (svdcmp(nrU, M, N, W, nrV)) { + aom_free(nrU); + aom_free(nrV); + return 1; + } + + /* aom_free Numerical Recipes arrays */ + aom_free(nrU); + aom_free(nrV); + + return 0; +} + +int pseudo_inverse(double *inv, double *matx, const int M, const int N) { + double ans; + int i, j, k; + double *const U = (double *)aom_malloc(M * N * sizeof(*matx)); + double *const W = (double *)aom_malloc(N * sizeof(*matx)); + double *const V = (double *)aom_malloc(N * N * sizeof(*matx)); + + if (!(U && W && V)) { + return 1; + } + if (SVD(U, W, V, matx, M, N)) { + aom_free(U); + aom_free(W); + aom_free(V); + return 1; + } + for (i = 0; i < N; i++) { + if (fabs(W[i]) < TINY_NEAR_ZERO) { + aom_free(U); + aom_free(W); + aom_free(V); + return 1; + } + } + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j++) { + ans = 0; + for (k = 0; k < N; k++) { + ans += V[k + N * i] * U[k + N * j] / W[k]; + } + inv[j + M * i] = ans; + } + } + aom_free(U); + aom_free(W); + aom_free(V); + return 0; +} + +static void normalize_homography(double *pts, int n, double *T) { + double *p = pts; + double mean[2] = { 0, 0 }; + double msqe = 0; + double scale; + int i; + for (i = 0; i < n; ++i, p += 2) { + mean[0] += p[0]; + mean[1] += p[1]; + } + mean[0] /= n; + mean[1] /= n; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] -= mean[0]; + p[1] -= mean[1]; + msqe += sqrt(p[0] * p[0] + p[1] * p[1]); + } + msqe /= n; + scale = (msqe == 0 ? 1.0 : sqrt(2) / msqe); + T[0] = scale; + T[1] = 0; + T[2] = -scale * mean[0]; + T[3] = 0; + T[4] = scale; + T[5] = -scale * mean[1]; + T[6] = 0; + T[7] = 0; + T[8] = 1; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] *= scale; + p[1] *= scale; + } +} + +static void invnormalize_mat(double *T, double *iT) { + double is = 1.0 / T[0]; + double m0 = -T[2] * is; + double m1 = -T[5] * is; + iT[0] = is; + iT[1] = 0; + iT[2] = m0; + iT[3] = 0; + iT[4] = is; + iT[5] = m1; + iT[6] = 0; + iT[7] = 0; + iT[8] = 1; +} + +static void denormalize_homography(double *params, double *T1, double *T2) { + double iT2[9]; + double params2[9]; + invnormalize_mat(T2, iT2); + multiply_mat(params, T1, params2, 3, 3, 3); + multiply_mat(iT2, params2, params, 3, 3, 3); +} + +static void denormalize_homography_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + memcpy(params_denorm, params, sizeof(*params) * 8); + params_denorm[8] = 1.0; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = params_denorm[3]; + params[5] = params_denorm[4]; + params[6] = params_denorm[6]; + params[7] = params_denorm[7]; +} + +static void denormalize_affine_reorder(double *params, double *T1, double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[4]; + params_denorm[3] = params[2]; + params_denorm[4] = params[3]; + params_denorm[5] = params[5]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = params_denorm[3]; + params[5] = params_denorm[4]; + params[6] = params[7] = 0; +} + +static void denormalize_rotzoom_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[2]; + params_denorm[3] = -params[1]; + params_denorm[4] = params[0]; + params_denorm[5] = params[3]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = -params[3]; + params[5] = params[2]; + params[6] = params[7] = 0; +} + +static void denormalize_translation_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = 1; + params_denorm[1] = 0; + params_denorm[2] = params[0]; + params_denorm[3] = 0; + params_denorm[4] = 1; + params_denorm[5] = params[1]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params[5] = 1; + params[3] = params[4] = 0; + params[6] = params[7] = 0; +} + +static int find_translation(int np, double *pts1, double *pts2, double *mat) { + int i; + double sx, sy, dx, dy; + double sumx, sumy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + sumx = 0; + sumy = 0; + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + sumx += dx - sx; + sumy += dy - sy; + } + mat[0] = sumx / np; + mat[1] = sumy / np; + denormalize_translation_reorder(mat, T1, T2); + return 0; +} + +static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * np2 * 9); + double *b = a + np2 * 4; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 4 + 0] = sx; + a[i * 2 * 4 + 1] = sy; + a[i * 2 * 4 + 2] = 1; + a[i * 2 * 4 + 3] = 0; + a[(i * 2 + 1) * 4 + 0] = sy; + a[(i * 2 + 1) * 4 + 1] = -sx; + a[(i * 2 + 1) * 4 + 2] = 0; + a[(i * 2 + 1) * 4 + 3] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (pseudo_inverse(temp, a, np2, 4)) { + aom_free(a); + return 1; + } + multiply_mat(temp, b, mat, 4, np2, 1); + denormalize_rotzoom_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int find_affine(int np, double *pts1, double *pts2, double *mat) { + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * np2 * 13); + double *b = a + np2 * 6; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 6 + 0] = sx; + a[i * 2 * 6 + 1] = sy; + a[i * 2 * 6 + 2] = 0; + a[i * 2 * 6 + 3] = 0; + a[i * 2 * 6 + 4] = 1; + a[i * 2 * 6 + 5] = 0; + a[(i * 2 + 1) * 6 + 0] = 0; + a[(i * 2 + 1) * 6 + 1] = 0; + a[(i * 2 + 1) * 6 + 2] = sx; + a[(i * 2 + 1) * 6 + 3] = sy; + a[(i * 2 + 1) * 6 + 4] = 0; + a[(i * 2 + 1) * 6 + 5] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (pseudo_inverse(temp, a, np2, 6)) { + aom_free(a); + return 1; + } + multiply_mat(temp, b, mat, 6, np2, 1); + denormalize_affine_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int find_vertrapezoid(int np, double *pts1, double *pts2, double *mat) { + const int np3 = np * 3; + double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); + double *U = a + np3 * 7; + double S[7], V[7 * 7], H[9]; + int i, mini; + double sx, sy, dx, dy; + double T1[9], T2[9]; + + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = 0; + a[i * 3 * 7 + 2] = -sx; + a[i * 3 * 7 + 3] = -sy; + a[i * 3 * 7 + 4] = -1; + a[i * 3 * 7 + 5] = dy * sx; + a[i * 3 * 7 + 6] = dy; + + a[(i * 3 + 1) * 7 + 0] = sx; + a[(i * 3 + 1) * 7 + 1] = 1; + a[(i * 3 + 1) * 7 + 2] = a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = + 0; + a[(i * 3 + 1) * 7 + 5] = -dx * sx; + a[(i * 3 + 1) * 7 + 6] = -dx; + + a[(i * 3 + 2) * 7 + 0] = -dy * sx; + a[(i * 3 + 2) * 7 + 1] = -dy; + a[(i * 3 + 2) * 7 + 2] = dx * sx; + a[(i * 3 + 2) * 7 + 3] = dx * sy; + a[(i * 3 + 2) * 7 + 4] = dx; + a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; + } + if (SVD(U, S, V, a, np3, 7)) { + aom_free(a); + return 1; + } else { + double minS = 1e12; + mini = -1; + for (i = 0; i < 7; ++i) { + if (S[i] < minS) { + minS = S[i]; + mini = i; + } + } + } + H[1] = H[7] = 0; + for (i = 0; i < 1; i++) H[i] = V[i * 7 + mini]; + for (; i < 6; i++) H[i + 1] = V[i * 7 + mini]; + for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; + + denormalize_homography_reorder(H, T1, T2); + aom_free(a); + if (H[8] == 0.0) { + return 1; + } else { + // normalize + double f = 1.0 / H[8]; + for (i = 0; i < 8; i++) mat[i] = f * H[i]; + } + return 0; +} + +static int find_hortrapezoid(int np, double *pts1, double *pts2, double *mat) { + const int np3 = np * 3; + double *a = (double *)aom_malloc(sizeof(*a) * np3 * 14); + double *U = a + np3 * 7; + double S[7], V[7 * 7], H[9]; + int i, mini; + double sx, sy, dx, dy; + double T1[9], T2[9]; + + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 3 * 7 + 0] = a[i * 3 * 7 + 1] = a[i * 3 * 7 + 2] = 0; + a[i * 3 * 7 + 3] = -sy; + a[i * 3 * 7 + 4] = -1; + a[i * 3 * 7 + 5] = dy * sy; + a[i * 3 * 7 + 6] = dy; + + a[(i * 3 + 1) * 7 + 0] = sx; + a[(i * 3 + 1) * 7 + 1] = sy; + a[(i * 3 + 1) * 7 + 2] = 1; + a[(i * 3 + 1) * 7 + 3] = a[(i * 3 + 1) * 7 + 4] = 0; + a[(i * 3 + 1) * 7 + 5] = -dx * sy; + a[(i * 3 + 1) * 7 + 6] = -dx; + + a[(i * 3 + 2) * 7 + 0] = -dy * sx; + a[(i * 3 + 2) * 7 + 1] = -dy * sy; + a[(i * 3 + 2) * 7 + 2] = -dy; + a[(i * 3 + 2) * 7 + 3] = dx * sy; + a[(i * 3 + 2) * 7 + 4] = dx; + a[(i * 3 + 2) * 7 + 5] = a[(i * 3 + 2) * 7 + 6] = 0; + } + + if (SVD(U, S, V, a, np3, 7)) { + aom_free(a); + return 1; + } else { + double minS = 1e12; + mini = -1; + for (i = 0; i < 7; ++i) { + if (S[i] < minS) { + minS = S[i]; + mini = i; + } + } + } + H[3] = H[6] = 0; + for (i = 0; i < 3; i++) H[i] = V[i * 7 + mini]; + for (; i < 5; i++) H[i + 1] = V[i * 7 + mini]; + for (; i < 7; i++) H[i + 2] = V[i * 7 + mini]; + + denormalize_homography_reorder(H, T1, T2); + aom_free(a); + if (H[8] == 0.0) { + return 1; + } else { + // normalize + double f = 1.0 / H[8]; + for (i = 0; i < 8; i++) mat[i] = f * H[i]; + } + return 0; +} + +static int find_homography(int np, double *pts1, double *pts2, double *mat) { + // Implemented from Peter Kovesi's normalized implementation + const int np3 = np * 3; + double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18); + double *U = a + np3 * 9; + double S[9], V[9 * 9], H[9]; + int i, mini; + double sx, sy, dx, dy; + double T1[9], T2[9]; + + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0; + a[i * 3 * 9 + 3] = -sx; + a[i * 3 * 9 + 4] = -sy; + a[i * 3 * 9 + 5] = -1; + a[i * 3 * 9 + 6] = dy * sx; + a[i * 3 * 9 + 7] = dy * sy; + a[i * 3 * 9 + 8] = dy; + + a[(i * 3 + 1) * 9 + 0] = sx; + a[(i * 3 + 1) * 9 + 1] = sy; + a[(i * 3 + 1) * 9 + 2] = 1; + a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] = + 0; + a[(i * 3 + 1) * 9 + 6] = -dx * sx; + a[(i * 3 + 1) * 9 + 7] = -dx * sy; + a[(i * 3 + 1) * 9 + 8] = -dx; + + a[(i * 3 + 2) * 9 + 0] = -dy * sx; + a[(i * 3 + 2) * 9 + 1] = -dy * sy; + a[(i * 3 + 2) * 9 + 2] = -dy; + a[(i * 3 + 2) * 9 + 3] = dx * sx; + a[(i * 3 + 2) * 9 + 4] = dx * sy; + a[(i * 3 + 2) * 9 + 5] = dx; + a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] = + 0; + } + + if (SVD(U, S, V, a, np3, 9)) { + aom_free(a); + return 1; + } else { + double minS = 1e12; + mini = -1; + for (i = 0; i < 9; ++i) { + if (S[i] < minS) { + minS = S[i]; + mini = i; + } + } + } + + for (i = 0; i < 9; i++) H[i] = V[i * 9 + mini]; + denormalize_homography_reorder(H, T1, T2); + aom_free(a); + if (H[8] == 0.0) { + return 1; + } else { + // normalize + double f = 1.0 / H[8]; + for (i = 0; i < 8; i++) mat[i] = f * H[i]; + } + return 0; +} + +static int get_rand_indices(int npoints, int minpts, int *indices, + unsigned int *seed) { + int i, j; + int ptr = rand_r(seed) % npoints; + if (minpts > npoints) return 0; + indices[0] = ptr; + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + i = 1; + while (i < minpts) { + int index = rand_r(seed) % npoints; + while (index) { + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + for (j = 0; j < i; ++j) { + if (indices[j] == ptr) break; + } + if (j == i) index--; + } + indices[i++] = ptr; + } + return 1; +} + +typedef struct { + int num_inliers; + double variance; + int *inlier_indices; +} RANSAC_MOTION; + +// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. +static int compare_motions(const void *arg_a, const void *arg_b) { + const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; + const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b; + + if (motion_a->num_inliers > motion_b->num_inliers) return -1; + if (motion_a->num_inliers < motion_b->num_inliers) return 1; + if (motion_a->variance < motion_b->variance) return -1; + if (motion_a->variance > motion_b->variance) return 1; + return 0; +} + +static int is_better_motion(const RANSAC_MOTION *motion_a, + const RANSAC_MOTION *motion_b) { + return compare_motions(motion_a, motion_b) < 0; +} + +static void copy_points_at_indices(double *dest, const double *src, + const int *indices, int num_points) { + for (int i = 0; i < num_points; ++i) { + const int index = indices[i]; + dest[i * 2] = src[index * 2]; + dest[i * 2 + 1] = src[index * 2 + 1]; + } +} + +static const double kInfiniteVariance = 1e12; + +static void clear_motion(RANSAC_MOTION *motion, int num_points) { + motion->num_inliers = 0; + motion->variance = kInfiniteVariance; + memset(motion->inlier_indices, 0, + sizeof(*motion->inlier_indices * num_points)); +} + +static int ransac(const int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_desired_motions, const int minpts, + IsDegenerateFunc is_degenerate, + FindTransformationFunc find_transformation, + ProjectPointsDoubleFunc projectpoints) { + static const double PROBABILITY_REQUIRED = 0.9; + static const double EPS = 1e-12; + + int N = 10000, trial_count = 0; + int i = 0; + int ret_val = 0; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *image1_coord; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. + double params_this_motion[MAX_PARAMDIM]; + + double *cnp1, *cnp2; + + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { + return 1; + } + + points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); + points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); + corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); + corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); + image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2); + + motions = + (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions); + for (i = 0; i < num_desired_motions; ++i) { + motions[i].inlier_indices = + (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints); + clear_motion(motions + i, npoints); + } + current_motion.inlier_indices = + (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints); + clear_motion(¤t_motion, npoints); + + worst_kept_motion = motions; + + if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions && + current_motion.inlier_indices)) { + ret_val = 1; + goto finish_ransac; + } + + cnp1 = corners1; + cnp2 = corners2; + for (i = 0; i < npoints; ++i) { + *(cnp1++) = *(matched_points++); + *(cnp1++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + } + + while (N > trial_count) { + double sum_distance = 0.0; + double sum_distance_squared = 0.0; + + clear_motion(¤t_motion, npoints); + + int degenerate = 1; + int num_degenerate_iter = 0; + + while (degenerate) { + num_degenerate_iter++; + if (!get_rand_indices(npoints, minpts, indices, &seed)) { + ret_val = 1; + goto finish_ransac; + } + + copy_points_at_indices(points1, corners1, indices, minpts); + copy_points_at_indices(points2, corners2, indices, minpts); + + degenerate = is_degenerate(points1); + if (num_degenerate_iter > MAX_DEGENERATE_ITER) { + ret_val = 1; + goto finish_ransac; + } + } + + if (find_transformation(minpts, points1, points2, params_this_motion)) { + trial_count++; + continue; + } + + projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2); + + for (i = 0; i < npoints; ++i) { + double dx = image1_coord[i * 2] - corners2[i * 2]; + double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1]; + double distance = sqrt(dx * dx + dy * dy); + + if (distance < INLIER_THRESHOLD) { + current_motion.inlier_indices[current_motion.num_inliers++] = i; + sum_distance += distance; + sum_distance_squared += distance * distance; + } + } + + if (current_motion.num_inliers >= worst_kept_motion->num_inliers && + current_motion.num_inliers > 1) { + int temp; + double fracinliers, pNoOutliers, mean_distance; + mean_distance = sum_distance / ((double)current_motion.num_inliers); + current_motion.variance = + sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - + mean_distance * mean_distance * ((double)current_motion.num_inliers) / + ((double)current_motion.num_inliers - 1.0); + if (is_better_motion(¤t_motion, worst_kept_motion)) { + // This motion is better than the worst currently kept motion. Remember + // the inlier points and variance. The parameters for each kept motion + // will be recomputed later using only the inliers. + worst_kept_motion->num_inliers = current_motion.num_inliers; + worst_kept_motion->variance = current_motion.variance; + memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices, + sizeof(*current_motion.inlier_indices) * npoints); + + assert(npoints > 0); + fracinliers = (double)current_motion.num_inliers / (double)npoints; + pNoOutliers = 1 - pow(fracinliers, minpts); + pNoOutliers = fmax(EPS, pNoOutliers); + pNoOutliers = fmin(1 - EPS, pNoOutliers); + temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers)); + + if (temp > 0 && temp < N) { + N = AOMMAX(temp, MIN_TRIALS); + } + + // Determine the new worst kept motion and its num_inliers and variance. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + trial_count++; + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion + (MAX_PARAMDIM - 1) * i); + num_inliers_by_motion[i] = motions[i].num_inliers; + } + +finish_ransac: + aom_free(points1); + aom_free(points2); + aom_free(corners1); + aom_free(corners2); + aom_free(image1_coord); + aom_free(current_motion.inlier_indices); + for (i = 0; i < num_desired_motions; ++i) { + aom_free(motions[i].inlier_indices); + } + aom_free(motions); + + return ret_val; +} + +static int is_collinear3(double *p1, double *p2, double *p3) { + static const double collinear_eps = 1e-3; + const double v = + (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); + return fabs(v) < collinear_eps; +} + +static int is_degenerate_translation(double *p) { + return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; +} + +static int is_degenerate_affine(double *p) { + return is_collinear3(p, p + 2, p + 4); +} + +static int is_degenerate_homography(double *p) { + return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) || + is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6); +} + +int ransac_translation(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_translation, find_translation, + project_points_double_translation); +} + +int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, + double *params_by_motion, int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_rotzoom, project_points_double_rotzoom); +} + +int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, + double *params_by_motion, int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_affine, project_points_double_affine); +} + +int ransac_homography(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 4, + is_degenerate_homography, find_homography, + project_points_double_homography); +} + +int ransac_hortrapezoid(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 4, + is_degenerate_homography, find_hortrapezoid, + project_points_double_hortrapezoid); +} + +int ransac_vertrapezoid(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 4, + is_degenerate_homography, find_vertrapezoid, + project_points_double_vertrapezoid); +} diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h new file mode 100644 index 000000000..f611add36 --- /dev/null +++ b/third_party/aom/av1/encoder/ransac.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_RANSAC_H_ +#define AV1_ENCODER_RANSAC_H_ + +#include +#include +#include +#include + +#include "av1/common/warped_motion.h" + +typedef int (*RansacFunc)(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_motions); + +/* Each of these functions fits a motion model from a set of + corresponding points in 2 frames using RANSAC. */ +int ransac_homography(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_motions); +int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion, + double *params_by_motion, int num_motions); +int ransac_hortrapezoid(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_motions); +int ransac_vertrapezoid(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_motions); +int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion, + double *params_by_motion, int num_motions); +int ransac_translation(int *matched_points, int npoints, + int *num_inliers_by_motion, double *params_by_motion, + int num_motions); +#endif // AV1_ENCODER_RANSAC_H_ diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c new file mode 100644 index 000000000..1f2ea3606 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -0,0 +1,1759 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/alloccommon.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/common/common.h" +#include "av1/common/entropymode.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/ratectrl.h" + +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 2025000 + +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 + +#define FRAME_OVERHEAD_BITS 200 +#if CONFIG_HIGHBITDEPTH +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case AOM_BITS_8: name = name##_8; break; \ + case AOM_BITS_10: name = name##_10; break; \ + case AOM_BITS_12: name = name##_12; break; \ + default: \ + assert(0 && \ + "bit_depth should be AOM_BITS_8, AOM_BITS_10" \ + " or AOM_BITS_12"); \ + name = NULL; \ + } \ + } while (0) +#else +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + (void)bit_depth; \ + name = name##_8; \ + } while (0) +#endif + +// Tables relating active max Q to active min Q +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +#if CONFIG_HIGHBITDEPTH +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; +static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; +#endif + +static int gf_high = 2000; +static int gf_low = 400; +static int kf_high = 5000; +static int kf_low = 400; + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1, + aom_bit_depth_t bit_depth) { + int i; + const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0. + if (minqtarget <= 2.0) return 0; + + for (i = 0; i < QINDEX_RANGE; i++) { + if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i; + } + + return QINDEX_RANGE - 1; +} + +static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, + int *arfgf_high, int *inter, int *rtc, + aom_bit_depth_t bit_depth) { + int i; + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = av1_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); + } +} + +void av1_rc_init_minq_luts(void) { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, AOM_BITS_8); +#if CONFIG_HIGHBITDEPTH + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, AOM_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, AOM_BITS_12); +#endif +} + +// These functions use formulaic calculations to make playing with the +// quantizer tables easier. If necessary they can be replaced by lookup +// tables if and when things settle down in the experimental bitstream +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { +// Convert the index to a real Q value (scaled down to match old Q values) +#if CONFIG_HIGHBITDEPTH + switch (bit_depth) { + case AOM_BITS_8: return av1_ac_quant(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant(qindex, 0, bit_depth) / 64.0; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1.0; + } +#else + return av1_ac_quant(qindex, 0, bit_depth) / 4.0; +#endif +} + +int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, aom_bit_depth_t bit_depth) { + const double q = av1_convert_qindex_to_q(qindex, bit_depth); + int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000; + + assert(correction_factor <= MAX_BPB_FACTOR && + correction_factor >= MIN_BPB_FACTOR); + + // q based adjustment to baseline enumerator + enumerator += (int)(enumerator * q) >> 12; + return (int)(enumerator * correction_factor / q); +} + +int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, + double correction_factor, + aom_bit_depth_t bit_depth) { + const int bpm = + (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); + return AOMMAX(FRAME_OVERHEAD_BITS, + (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); +} + +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const int min_frame_target = + AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); +// Clip the frame target to the minimum setup value. +#if CONFIG_EXT_REFS + if (cpi->rc.is_src_frame_alt_ref) { +#else + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { +#endif // CONFIG_EXT_REFS + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } else if (target < min_frame_target) { + target = min_frame_target; + } + + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + + return target; +} + +int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->rc_max_intra_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + return target; +} + +// Update the buffer level: leaky bucket model. +static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + +// Non-viewable frames are a special case and are treated as pure overhead. +#if CONFIG_EXT_REFS + // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME + // differently, since it is a no-show frame. + if (!cm->show_frame && !rc->is_bwd_ref_frame) +#else + if (!cm->show_frame) +#endif // CONFIG_EXT_REFS + rc->bits_off_target -= encoded_frame_size; + else + rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; + + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; +} + +int av1_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return AOMMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + return AOMMAX(interval, min_gf_interval); +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; + + if (pass == 0 && oxcf->rc_mode == AOM_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + rc->avg_frame_qindex[KEY_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + } + + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; + + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; + + rc->total_actual_bits = 0; + rc->total_target_bits = 0; + rc->total_target_vs_actual = 0; + + rc->frames_since_key = 8; // Sensible default for first frame. + rc->this_key_frame_forced = 0; + rc->next_key_frame_forced = 0; + rc->source_alt_ref_pending = 0; + rc->source_alt_ref_active = 0; + + rc->frames_till_gf_update_due = 0; + rc->ni_av_qi = oxcf->worst_allowed_q; + rc->ni_tot_qi = 0; + rc->ni_frames = 0; + + rc->tot_q = 0.0; + rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 1.0; + } + + rc->min_gf_interval = oxcf->min_gf_interval; + rc->max_gf_interval = oxcf->max_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, oxcf->init_framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->init_framerate, rc->min_gf_interval); + rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; +} + +int av1_rc_drop_frame(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + + if (!oxcf->drop_frames_water_mark) { + return 0; + } else { + if (rc->buffer_level < 0) { + // Always drop if buffer is below 0. + return 1; + } else { + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. + int drop_mark = + (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); + if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + --rc->decimation_factor; + } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } + } + } +} + +static double get_rate_correction_factor(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + double rcf; + + if (cpi->common.frame_type == KEY_FRAME) { + rcf = rc->rate_correction_factors[KF_STD]; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rcf = rc->rate_correction_factors[rf_lvl]; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rcf = rc->rate_correction_factors[GF_ARF_STD]; + else + rcf = rc->rate_correction_factors[INTER_NORMAL]; + } + rcf *= rcf_mult[rc->frame_size_selector]; + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); +} + +static void set_rate_correction_factor(AV1_COMP *cpi, double factor) { + RATE_CONTROL *const rc = &cpi->rc; + + // Normalize RCF to account for the size-dependent scaling factor. + factor /= rcf_mult[cpi->rc.frame_size_selector]; + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (cpi->common.frame_type == KEY_FRAME) { + rc->rate_correction_factors[KF_STD] = factor; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rc->rate_correction_factors[GF_ARF_STD] = factor; + else + rc->rate_correction_factors[INTER_NORMAL] = factor; + } +} + +void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int correction_factor = 100; + double rate_correction_factor = get_rate_correction_factor(cpi); + double adjustment_limit; + + int projected_size_based_on_q = 0; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Clear down mmx registers to allow floating point in what follows + aom_clear_system_state(); + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. + // Stay in double to avoid int overflow when values are large + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) { + projected_size_based_on_q = + av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); + } else { + projected_size_based_on_q = + av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs, + rate_correction_factor, cm->bit_depth); + } + // Work out a size correction factor. + if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) + correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / + projected_size_based_on_q); + + // More heavily damped adjustment used if we have been oscillating either side + // of target. + if (correction_factor > 0) { + adjustment_limit = + 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor))); + } else { + adjustment_limit = 0.75; + } + + cpi->rc.q_2_frame = cpi->rc.q_1_frame; + cpi->rc.q_1_frame = cm->base_qindex; + cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; + if (correction_factor > 110) + cpi->rc.rc_1_frame = -1; + else if (correction_factor < 90) + cpi->rc.rc_1_frame = 1; + else + cpi->rc.rc_1_frame = 0; + + if (correction_factor > 102) { + // We are not already at the worst allowable quality + correction_factor = + (int)(100 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } else if (correction_factor < 99) { + // We are not already at the best allowable quality + correction_factor = + (int)(100 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + set_rate_correction_factor(cpi, rate_correction_factor); +} + +int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality) { + const AV1_COMMON *const cm = &cpi->common; + int q = active_worst_quality; + int last_error = INT_MAX; + int i, target_bits_per_mb, bits_per_mb_at_this_q; + const double correction_factor = get_rate_correction_factor(cpi); + + // Calculate required scaling factor based on target frame size and size of + // frame produced using previous Q. + target_bits_per_mb = + (int)((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs; + + i = active_best_quality; + + do { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { + bits_per_mb_at_this_q = + (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); + } else { + bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb( + cm->frame_type, i, correction_factor, cm->bit_depth); + } + + if (bits_per_mb_at_this_q <= target_bits_per_mb) { + if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) + q = i; + else + q = i - 1; + + break; + } else { + last_error = bits_per_mb_at_this_q - target_bits_per_mb; + } + } while (++i <= active_worst_quality); + + // In CBR mode, this makes sure q is between oscillating Qs to prevent + // resonance. + if (cpi->oxcf.rc_mode == AOM_CBR && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + } + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const unsigned int curr_frame = cpi->common.current_video_frame; + int active_worst_quality; + + if (cpi->common.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 + : rc->last_q[INTER_FRAME]; + } else { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2 + : rc->last_q[INTER_FRAME] * 2; + } + } + return AOMMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. + ambient_qp = (cm->current_video_frame < 5) + ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME], + rc->avg_frame_qindex[KEY_FRAME]) + : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); + if (rc->buffer_level > rc->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. + int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (rc->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = ambient_qp + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, + int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + int *rtc_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = av1_compute_qdelta( + rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (cm->current_video_frame > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = get_kf_active_quality( + rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (cm->current_video_frame > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + int qdelta = 0; + aom_clear_system_state(); + qdelta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + // Special case code to try and match quality with forced key frames + if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const AV1EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.1; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, + int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level(rc, oxcf); + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); + int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + + if (frame_is_intra_only(cm)) { + if (oxcf->rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const int delta_qindex = + av1_compute_qdelta(rc, q_val, q_val * 0.25, cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (rc->this_key_frame_forced) { + const int qindex = rc->last_boosted_qindex; + const double last_boosted_q = + av1_convert_qindex_to_q(qindex, cm->bit_depth); + const int delta_qindex = av1_compute_qdelta( + rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + + active_best_quality = get_kf_active_quality( + rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta on active_best_quality. + { + const double q_val = + av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + q = (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + ? rc->avg_frame_qindex[INTER_FRAME] + : rc->avg_frame_qindex[KEY_FRAME]; + // For constrained quality dont allow Q less than the cq level + if (oxcf->rc_mode == AOM_CQ) { + if (q < cq_level) q = cq_level; + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; + } else if (oxcf->rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const int delta_qindex = + (cpi->refresh_alt_ref_frame) + ? av1_compute_qdelta(rc, q_val, q_val * 0.40, cm->bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + } + } else { + if (oxcf->rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, cm->bit_depth); + const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + const int delta_qindex = av1_compute_qdelta( + rc, q_val, + q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the lower of active_worst_quality and recent/average Q. + active_best_quality = (cm->current_video_frame > 1) + ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]] + : inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + { + int qdelta = 0; + aom_clear_system_state(); + if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + qdelta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); + } + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + if (oxcf->rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { + static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL +#if CONFIG_EXT_REFS + 0.80, // INTER_LOW + 1.50, // INTER_HIGH + 1.25, // GF_ARF_LOW +#else + 1.00, // INTER_HIGH + 1.50, // GF_ARF_LOW +#endif // CONFIG_EXT_REFS + 2.00, // GF_ARF_STD + 2.00, // KF_STD + }; + static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = +#if CONFIG_EXT_REFS + { INTER_FRAME, INTER_FRAME, INTER_FRAME, + INTER_FRAME, INTER_FRAME, KEY_FRAME }; +#else + { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME }; +#endif // CONFIG_EXT_REFS + const AV1_COMMON *const cm = &cpi->common; + int qdelta = + av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, + rate_factor_deltas[rf_level], cm->bit_depth); + return qdelta; +} + +#define STATIC_MOTION_THRESH 95 +static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int cq_level = get_active_cq_level(rc, oxcf); + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + + if (frame_is_intra_only(cm)) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + double last_boosted_q; + int delta_qindex; + int qindex; + + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, cm->bit_depth); + active_worst_quality = + AOMMIN(qindex + delta_qindex, active_worst_quality); + } else { + qindex = rc->last_boosted_qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = + get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + // For constrained quality dont allow Q less than the cq level + if (oxcf->rc_mode == AOM_CQ) { + if (q < cq_level) q = cq_level; + + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; + + } else if (oxcf->rc_mode == AOM_Q) { + if (!cpi->refresh_alt_ref_frame) { + active_best_quality = cq_level; + } else { + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode AOM_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) + active_best_quality = (active_best_quality + cq_level + 1) / 2; + } + } else { + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + } + } else { + if (oxcf->rc_mode == AOM_Q) { + active_best_quality = cq_level; + } else { + active_best_quality = inter_minq[active_worst_quality]; + + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Extension to max or min Q if undershoot or overshoot is outside + // the permitted range. + if ((cpi->oxcf.rc_mode != AOM_Q) && + (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { + if (frame_is_intra_only(cm) || + (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); + active_worst_quality += (cpi->twopass.extend_maxq / 2); + } else { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; + active_worst_quality += cpi->twopass.extend_maxq; + } + } + + aom_clear_system_state(); + // Static forced key frames Q restrictions dealt with elsewhere. + if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced || + (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { + int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index], + active_worst_quality); + active_worst_quality = + AOMMAX(active_worst_quality + qdelta, active_best_quality); + } + + // Modify active_best_quality for downscaled normal frames. + if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) { + int qdelta = av1_compute_qdelta_by_rate( + rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); + active_best_quality = + AOMMAX(active_best_quality + qdelta, rc->best_quality); + } + + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + if (oxcf->rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames. + } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { + // If static since last kf use better of last boosted and last kf q. + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + } else { + q = rc->last_boosted_qindex; + } + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality); + if (q > active_worst_quality) { + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target >= rc->max_frame_bandwidth) + active_worst_quality = q; + else + q = active_worst_quality; + } + } + clamp(q, active_best_quality, active_worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int *bottom_index, + int *top_index) { + int q; + if (cpi->oxcf.pass == 0) { + if (cpi->oxcf.rc_mode == AOM_CBR) + q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); + else + q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); + } else { + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + } + + return q; +} + +void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit) { + if (cpi->oxcf.rc_mode == AOM_Q) { + *frame_under_shoot_limit = 0; + *frame_over_shoot_limit = INT_MAX; + } else { + // For very small rate targets where the fractional adjustment + // may be tiny make sure there is at least a minimum range. + const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100; + *frame_under_shoot_limit = AOMMAX(frame_target - tolerance - 200, 0); + *frame_over_shoot_limit = + AOMMIN(frame_target + tolerance + 200, cpi->rc.max_frame_bandwidth); + } +} + +void av1_rc_set_frame_target(AV1_COMP *cpi, int target) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + rc->this_frame_target = target; + + // Modify frame size target when down-scaling. + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && + rc->frame_size_selector != UNSCALED) + rc->this_frame_target = (int)(rc->this_frame_target * + rate_thresh_mult[rc->frame_size_selector]); + + // Target rate per SB64 (including partial SB64s. + rc->sb64_target_rate = (int)((int64_t)rc->this_frame_target * 64 * 64) / + (cm->width * cm->height); +} + +static void update_alt_ref_frame_stats(AV1_COMP *cpi) { + // this frame refreshes means next frames don't unless specified by user + RATE_CONTROL *const rc = &cpi->rc; + rc->frames_since_golden = 0; + + // Mark the alt ref as done (setting to 0 means no further alt refs pending). + rc->source_alt_ref_pending = 0; + + // Set the alternate reference frame active flag + rc->source_alt_ref_active = 1; +} + +static void update_golden_frame_stats(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + +#if CONFIG_EXT_REFS + // Update the Golden frame usage counts. + // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame, + // only the virtual indices for the reference frame will be + // updated and cpi->refresh_golden_frame will still be zero. + if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) { +#else + // Update the Golden frame usage counts. + if (cpi->refresh_golden_frame) { +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + // We will not use internal overlay frames to replace the golden frame + if (!rc->is_src_frame_ext_arf) +#endif // CONFIG_EXT_REFS + // this frame refreshes means next frames don't unless specified by user + rc->frames_since_golden = 0; + + // If we are not using alt ref in the up and coming group clear the arf + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (cpi->oxcf.pass == 2) { + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } else if (!rc->source_alt_ref_pending) { + rc->source_alt_ref_active = 0; + } + + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + + } else if (!cpi->refresh_alt_ref_frame) { + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + + rc->frames_since_golden++; + } +} + +void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { + const AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + const int qindex = cm->base_qindex; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { + av1_cyclic_refresh_postencode(cpi); + } + + // Update rate control heuristics + rc->projected_frame_size = (int)(bytes_used << 3); + + // Post encode loop adjustment of Q prediction. + av1_rc_update_rate_correction_factors(cpi); + + // Keep a record of last Q and ambient average Q. + if (cm->frame_type == KEY_FRAME) { + rc->last_q[KEY_FRAME] = qindex; + rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + } else { + if (!rc->is_src_frame_alt_ref && + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + rc->last_q[INTER_FRAME] = qindex; + rc->avg_frame_qindex[INTER_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + rc->ni_frames++; + rc->tot_q += av1_convert_qindex_to_q(qindex, cm->bit_depth); + rc->avg_q = rc->tot_q / rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + } + } + + // Keep record of last boosted (KF/GF/ARF) Q value. + // If the current frame is coded at a lower Q then we also update it. + // If all mbs in this group are skipped only update if the Q value is + // better than that already stored. + // This is used to help set quality in forced key frames to reduce popping + if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) || + (!rc->constrained_gf_group && + (cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { + rc->last_boosted_qindex = qindex; + } + if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; + + update_buffer_level(cpi, rc->projected_frame_size); + + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. + if (cm->frame_type != KEY_FRAME) { + rc->rolling_target_bits = ROUND_POWER_OF_TWO( + rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + rc->rolling_actual_bits = ROUND_POWER_OF_TWO( + rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + rc->long_rolling_target_bits = ROUND_POWER_OF_TWO( + rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5); + rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO( + rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5); + } + + // Actual bits spent + rc->total_actual_bits += rc->projected_frame_size; +#if CONFIG_EXT_REFS + rc->total_target_bits += + (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0; +#else + rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; +#endif // CONFIG_EXT_REFS + + rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; + + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && + (cm->frame_type != KEY_FRAME)) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); + + if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; + +#if CONFIG_EXT_REFS + if (cm->show_frame || rc->is_bwd_ref_frame) { +#else + if (cm->show_frame) { +#endif // CONFIG_EXT_REFS + rc->frames_since_key++; + rc->frames_to_key--; + } + + // Trigger the resizing of the next frame if it is scaled. + if (oxcf->pass != 0) { + cpi->resize_pending = + rc->next_frame_size_selector != rc->frame_size_selector; + rc->frame_size_selector = rc->next_frame_size_selector; + } +} + +void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; +} + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { + static const int af_ratio = 10; + const RATE_CONTROL *const rc = &cpi->rc; + int target; +#if USE_ALTREF_FOR_ONE_PASS + target = + (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) + ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / + (rc->baseline_gf_interval + af_ratio - 1) + : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / + (rc->baseline_gf_interval + af_ratio - 1); +#else + target = rc->avg_frame_bandwidth; +#endif + return av1_rc_clamp_pframe_target_size(cpi, target); +} + +static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + const int target = rc->avg_frame_bandwidth * kf_ratio; + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) { + rc->frames_till_gf_update_due = rc->frames_to_key; + rc->constrained_gf_group = 1; + } else { + rc->constrained_gf_group = 0; + } + cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_vbr(cpi); + else + target = calc_pframe_target_size_one_pass_vbr(cpi); + av1_rc_set_frame_target(cpi, target); +} + +static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + int min_frame_target = + AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target; + + if (oxcf->gf_cbr_boost_pct) { + const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100; + target = cpi->refresh_golden_frame + ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * + af_ratio_pct) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) + : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = rc->avg_frame_bandwidth; + } + + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. + const int pct_high = + (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (target * pct_high) / 200; + } + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + return AOMMAX(min_frame_target, target); +} + +static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + int target; + if (cpi->common.current_video_frame == 0) { + target = ((rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX + : (int)(rc->starting_buffer_level / 2); + } else { + int kf_boost = 32; + double framerate = cpi->framerate; + + kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); + } + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. + if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + av1_cyclic_refresh_set_golden_update(cpi); + else + rc->baseline_gf_interval = + (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + + // Any update/change of global cyclic refresh parameters (amount/delta-qp) + // should be done here, before the frame qp is selected. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + av1_cyclic_refresh_update_parameters(cpi); + + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_cbr(cpi); + else + target = calc_pframe_target_size_one_pass_cbr(cpi); + + av1_rc_set_frame_target(cpi, target); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) + cpi->resize_pending = av1_resize_one_pass_cbr(cpi); + else + cpi->resize_pending = 0; +} + +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth) { + int start_index = rc->worst_quality; + int target_index = rc->worst_quality; + int i; + + // Convert the average q value to an index. + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + start_index = i; + if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break; + } + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + target_index = i; + if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break; + } + + return target_index - start_index; +} + +int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + aom_bit_depth_t bit_depth) { + int target_index = rc->worst_quality; + int i; + + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth); + + // Find the target bits per mb based on the base value and given ratio. + const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= + target_bits_per_mb) { + target_index = i; + break; + } + } + return target_index - qindex; +} + +void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, + RATE_CONTROL *const rc) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + // Special case code for 1 pass fixed Q mode tests + if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { + rc->max_gf_interval = FIXED_GF_INTERVAL; + rc->min_gf_interval = FIXED_GF_INTERVAL; + rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; + } else { + // Set Maximum gf/arf interval + rc->max_gf_interval = oxcf->max_gf_interval; + rc->min_gf_interval = oxcf->min_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, cpi->framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = av1_rc_get_default_max_gf_interval( + cpi->framerate, rc->min_gf_interval); + + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; + + // Clamp min to max + rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval); + } +} + +void av1_rc_update_framerate(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + int vbr_max_bits; + + rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); + rc->min_frame_bandwidth = + (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); + + rc->min_frame_bandwidth = + AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + + // A maximum bitrate for a frame is defined. + // The baseline for this aligns with HW implementations that + // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits + // per 16x16 MB (averaged over a frame). However this limit is extended if + // a very high rate is given on the command line or the the rate cannnot + // be acheived because of a user specificed max q (e.g. when the user + // specifies lossless encode. + vbr_max_bits = + (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / + 100); + rc->max_frame_bandwidth = + AOMMAX(AOMMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + + av1_rc_set_gf_interval_range(cpi, rc); +} + +#define VBR_PCT_ADJUSTMENT_LIMIT 50 +// For VBR...adjustment to the frame target based on error from previous frames +static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { + RATE_CONTROL *const rc = &cpi->rc; + int64_t vbr_bits_off_target = rc->vbr_bits_off_target; + int max_delta; + double position_factor = 1.0; + + // How far through the clip are we. + // This number is used to damp the per frame rate correction. + // Range 0 - 1.0 + if (cpi->twopass.total_stats.count != 0.) { + position_factor = sqrt((double)cpi->common.current_video_frame / + cpi->twopass.total_stats.count); + } + max_delta = (int)(position_factor * + ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); + + // vbr_bits_off_target > 0 means we have extra bits to spend + if (vbr_bits_off_target > 0) { + *this_frame_target += (vbr_bits_off_target > max_delta) + ? max_delta + : (int)vbr_bits_off_target; + } else { + *this_frame_target -= (vbr_bits_off_target < -max_delta) + ? max_delta + : (int)-vbr_bits_off_target; + } + + // Fast redistribution of bits arising from massive local undershoot. + // Dont do it for kf,arf,gf or overlay frames. + if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && + rc->vbr_bits_off_target_fast) { + int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target); + int fast_extra_bits; + fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = (int)AOMMIN( + fast_extra_bits, + AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8)); + *this_frame_target += (int)fast_extra_bits; + rc->vbr_bits_off_target_fast -= fast_extra_bits; + } +} + +void av1_set_target_rate(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int target_rate = rc->base_frame_target; + + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ) + vbr_rate_correction(cpi, &target_rate); + av1_rc_set_frame_target(cpi, target_rate); +} + +// Check if we should resize, based on average QP from past x frames. +// Only allow for resize at most one scale down for now, scaling factor is 2. +int av1_resize_one_pass_cbr(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int resize_now = 0; + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 1; + // Don't resize on key frame; reset the counters on key frame. + if (cm->frame_type == KEY_FRAME) { + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + return 0; + } + // Resize based on average buffer underflow and QP over some window. + // Ignore samples close to key frame, since QP is usually high after key. + if (cpi->rc.frames_since_key > 2 * cpi->framerate) { + const int window = (int)(5 * cpi->framerate); + cpi->resize_avg_qp += cm->base_qindex; + if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) + ++cpi->resize_buffer_underflow; + ++cpi->resize_count; + // Check for resize action every "window" frames. + if (cpi->resize_count >= window) { + int avg_qp = cpi->resize_avg_qp / cpi->resize_count; + // Resize down if buffer level has underflowed sufficent amount in past + // window, and we are at original resolution. + // Resize back up if average QP is low, and we are currently in a resized + // down state. + if (cpi->resize_state == 0 && + cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) { + resize_now = 1; + cpi->resize_state = 1; + } else if (cpi->resize_state == 1 && + avg_qp < 40 * cpi->rc.worst_quality / 100) { + resize_now = -1; + cpi->resize_state = 0; + } + // Reset for next window measurement. + cpi->resize_avg_qp = 0; + cpi->resize_count = 0; + cpi->resize_buffer_underflow = 0; + } + } + // If decision is to resize, reset some quantities, and check is we should + // reduce rate correction factor, + if (resize_now != 0) { + int target_bits_per_frame; + int active_worst_quality; + int qindex; + int tot_scale_change; + // For now, resize is by 1/2 x 1/2. + cpi->resize_scale_num = 1; + cpi->resize_scale_den = 2; + tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) / + (cpi->resize_scale_num * cpi->resize_scale_num); + // Reset buffer level to optimal, update target size. + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi); + // Reset cyclic refresh parameters. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) + av1_cyclic_refresh_reset_resize(cpi); + // Get the projected qindex, based on the scaled target frame size (scaled + // so target_bits_per_mb in av1_rc_regulate_q will be correct target). + target_bits_per_frame = (resize_now == 1) + ? rc->this_frame_target * tot_scale_change + : rc->this_frame_target / tot_scale_change; + active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality, + active_worst_quality); + // If resize is down, check if projected q index is close to worst_quality, + // and if so, reduce the rate correction factor (since likely can afford + // lower q for resized frame). + if (resize_now == 1 && qindex > 90 * cpi->rc.worst_quality / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.85; + } + // If resize is back up, check if projected q index is too much above the + // current base_qindex, and if so, reduce the rate correction factor + // (since prefer to keep q for resized frame at least close to previous q). + if (resize_now == -1 && qindex > 130 * cm->base_qindex / 100) { + rc->rate_correction_factors[INTER_NORMAL] *= 0.9; + } + } + return resize_now; +} diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h new file mode 100644 index 000000000..93a9b4939 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_RATECTRL_H_ +#define AV1_ENCODER_RATECTRL_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +#define MIN_GF_INTERVAL 4 +#define MAX_GF_INTERVAL 16 +#define FIXED_GF_INTERVAL 8 // Used in some testing modes only + +#if CONFIG_EXT_REFS +typedef enum { + INTER_NORMAL = 0, + INTER_LOW = 1, + INTER_HIGH = 2, + GF_ARF_LOW = 3, + GF_ARF_STD = 4, + KF_STD = 5, + RATE_FACTOR_LEVELS = 6 +} RATE_FACTOR_LEVEL; +#else +typedef enum { + INTER_NORMAL = 0, + INTER_HIGH = 1, + GF_ARF_LOW = 2, + GF_ARF_STD = 3, + KF_STD = 4, + RATE_FACTOR_LEVELS = 5 +} RATE_FACTOR_LEVEL; +#endif // CONFIG_EXT_REFS + +// Internal frame scaling level. +typedef enum { + UNSCALED = 0, // Frame is unscaled. + SCALE_STEP1 = 1, // First-level down-scaling. + FRAME_SCALE_STEPS +} FRAME_SCALE_LEVEL; + +// Frame dimensions multiplier wrt the native frame size, in 1/16ths, +// specified for the scale-up case. +// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is +// intended to match the capabilities of the normative scaling filters, +// giving precedence to the up-scaling accuracy. +static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 }; + +// Multiplier of the target rate to be used as threshold for triggering scaling. +static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; + +// Scale dependent Rate Correction Factor multipliers. Compensates for the +// greater number of bits per pixel generated in down-scaled frames. +static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; + +typedef struct { + // Rate targetting variables + int base_frame_target; // A baseline frame target before adjustment + // for previous under or over shoot. + int this_frame_target; // Actual frame target after rc adjustment. + int projected_frame_size; + int sb64_target_rate; + int last_q[FRAME_TYPES]; // Separate values for Intra/Inter + int last_boosted_qindex; // Last boosted GF/KF/ARF q + int last_kf_qindex; // Q index of the last key frame coded. + + int gfu_boost; + int last_boost; + int kf_boost; + + double rate_correction_factors[RATE_FACTOR_LEVELS]; + + int frames_since_golden; + int frames_till_gf_update_due; + int min_gf_interval; + int max_gf_interval; + int static_scene_max_gf_interval; + int baseline_gf_interval; + int constrained_gf_group; + int frames_to_key; + int frames_since_key; + int this_key_frame_forced; + int next_key_frame_forced; + int source_alt_ref_pending; + int source_alt_ref_active; + int is_src_frame_alt_ref; + +#if CONFIG_EXT_REFS + // Length of the bi-predictive frame group interval + int bipred_group_interval; + + // NOTE: Different types of frames may have different bits allocated + // accordingly, aiming to achieve the overall optimal RD performance. + int is_bwd_ref_frame; + int is_last_bipred_frame; + int is_bipred_frame; + int is_src_frame_ext_arf; +#endif // CONFIG_EXT_REFS + + int avg_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame + int max_frame_bandwidth; // Maximum burst rate allowed for a frame. + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex[FRAME_TYPES]; + double tot_q; + double avg_q; + + int64_t buffer_level; + int64_t bits_off_target; + int64_t vbr_bits_off_target; + int64_t vbr_bits_off_target_fast; + + int decimation_factor; + int decimation_count; + + int rolling_target_bits; + int rolling_actual_bits; + + int long_rolling_target_bits; + int long_rolling_actual_bits; + + int rate_error_estimate; + + int64_t total_actual_bits; + int64_t total_target_bits; + int64_t total_target_vs_actual; + + int worst_quality; + int best_quality; + + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + + // rate control history for last frame(1) and the frame before(2). + // -1: undershot + // 1: overshoot + // 0: not initialized. + int rc_1_frame; + int rc_2_frame; + int q_1_frame; + int q_2_frame; + + // Auto frame-scaling variables. + FRAME_SCALE_LEVEL frame_size_selector; + FRAME_SCALE_LEVEL next_frame_size_selector; + int frame_width[FRAME_SCALE_STEPS]; + int frame_height[FRAME_SCALE_STEPS]; + int rf_level_maxq[RATE_FACTOR_LEVELS]; +} RATE_CONTROL; + +struct AV1_COMP; +struct AV1EncoderConfig; + +void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass, + RATE_CONTROL *rc); + +int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, + double correction_factor, aom_bit_depth_t bit_depth); + +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth); + +void av1_rc_init_minq_luts(void); + +int av1_rc_get_default_min_gf_interval(int width, int height, double framerate); +// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to +// be passed in to ensure that the max_gf_interval returned is at least as bis +// as that. +int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); + +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// av1_rc_get_one_pass_vbr_params() +// av1_rc_get_one_pass_cbr_params() +// av1_rc_get_first_pass_params() +// av1_rc_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by one of: +// av1_rc_postencode_update() +// av1_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the av1_rc_get_..._params() functions and +// updated during the av1_rc_postencode_update...() functions. +// The only exceptions are av1_rc_drop_frame() and +// av1_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. +void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi); +void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi); + +// Post encode update of the rate control parameters based +// on bytes used +void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi); + +// Updates rate correction factors +// Changes only the rate correction factors in the rate control structure. +void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi); + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int av1_rc_drop_frame(struct AV1_COMP *cpi); + +// Computes frame size bounds. +void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, + int this_frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); + +// Picks q and q bounds given the target for bits +int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int *bottom_index, + int *top_index); + +// Estimates q to achieve a target bits per frame +int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality); + +// Estimates bits per mb for a given qindex and correction factor. +int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, aom_bit_depth_t bit_depth); + +// Clamping utilities for bitrate targets for iframes and pframes. +int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi, + int target); +int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi, + int target); +// Utility to set frame_target into the RATE_CONTROL structure +// This function is called only from the av1_rc_get_..._params() functions. +void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a target q value +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a value that should equate to the given rate ratio. +int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + aom_bit_depth_t bit_depth); + +int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q); + +void av1_rc_update_framerate(struct AV1_COMP *cpi); + +void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi, + RATE_CONTROL *const rc); + +void av1_set_target_rate(struct AV1_COMP *cpi); + +int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_RATECTRL_H_ diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.c b/third_party/aom/av1/encoder/ratectrl_xiph.c new file mode 100644 index 000000000..b9f827528 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl_xiph.c @@ -0,0 +1,1244 @@ +/* + * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include "av1/common/odintrin.h" +#include "av1/encoder/ratectrl_xiph.h" + +#define OD_Q57(v) ((int64_t)((uint64_t)(v) << 57)) +#define OD_F_Q45(v) ((int64_t)(((v) * ((int64_t)1 << 45)))) +#define OD_F_Q12(v) ((int32_t)(((v) * ((int32_t)1 << 12)))) + +/*A rough lookup table for tan(x), 0 <= x < pi/2. + The values are Q12 fixed-point and spaced at 5 degree intervals. + These decisions are somewhat arbitrary, but sufficient for the 2nd order + Bessel follower below. + Values of x larger than 85 degrees are extrapolated from the last interval, + which is way off, but "good enough".*/ +static uint16_t OD_ROUGH_TAN_LOOKUP[18] = { 0, 358, 722, 1098, 1491, + 1910, 2365, 2868, 3437, 4096, + 4881, 5850, 7094, 8784, 11254, + 15286, 23230, 46817 }; + +/*alpha is Q24 in the range [0,0.5). + The return values is 5.12.*/ +static int od_warp_alpha(int alpha) { + int i; + int d; + int t0; + int t1; + i = alpha * 36 >> 24; + if (i >= 17) i = 16; + t0 = OD_ROUGH_TAN_LOOKUP[i]; + t1 = OD_ROUGH_TAN_LOOKUP[i + 1]; + d = alpha * 36 - (i << 24); + return (int)((((int64_t)t0 << 32) + ((t1 - t0) << 8) * (int64_t)d) >> 32); +} + +static const int64_t OD_ATANH_LOG2[32] = { + 0x32B803473F7AD0F4LL, 0x2F2A71BD4E25E916LL, 0x2E68B244BB93BA06LL, + 0x2E39FB9198CE62E4LL, 0x2E2E683F68565C8FLL, 0x2E2B850BE2077FC1LL, + 0x2E2ACC58FE7B78DBLL, 0x2E2A9E2DE52FD5F2LL, 0x2E2A92A338D53EECLL, + 0x2E2A8FC08F5E19B6LL, 0x2E2A8F07E51A485ELL, 0x2E2A8ED9BA8AF388LL, + 0x2E2A8ECE2FE7384ALL, 0x2E2A8ECB4D3E4B1ALL, 0x2E2A8ECA94940FE8LL, + 0x2E2A8ECA6669811DLL, 0x2E2A8ECA5ADEDD6ALL, 0x2E2A8ECA57FC347ELL, + 0x2E2A8ECA57438A43LL, 0x2E2A8ECA57155FB4LL, 0x2E2A8ECA5709D510LL, + 0x2E2A8ECA5706F267LL, 0x2E2A8ECA570639BDLL, 0x2E2A8ECA57060B92LL, + 0x2E2A8ECA57060008LL, 0x2E2A8ECA5705FD25LL, 0x2E2A8ECA5705FC6CLL, + 0x2E2A8ECA5705FC3ELL, 0x2E2A8ECA5705FC33LL, 0x2E2A8ECA5705FC30LL, + 0x2E2A8ECA5705FC2FLL, 0x2E2A8ECA5705FC2FLL +}; + +static int od_ilog64(int64_t v) { + static const unsigned char OD_DEBRUIJN_IDX64[64] = { + 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, + 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, + 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, + 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58 + }; + int ret; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + ret = (int)v & 1; + v = (v >> 1) + 1; + ret += OD_DEBRUIJN_IDX64[v * UINT64_C(0x218A392CD3D5DBF) >> 58 & 0x3F]; + return ret; +} + +/*Computes the binary exponential of logq57. + input: a log base 2 in Q57 format + output: a 64 bit integer in Q0 (no fraction) */ +static int64_t od_bexp64(int64_t logq57) { + int64_t w; + int64_t z; + int ipart; + ipart = (int)(logq57 >> 57); + if (ipart < 0) return 0; + if (ipart >= 63) return 0x7FFFFFFFFFFFFFFFLL; + z = logq57 - OD_Q57(ipart); + if (z) { + int64_t mask; + int64_t wlo; + int i; + /*C doesn't give us 64x64->128 muls, so we use CORDIC. + This is not particularly fast, but it's not being used in time-critical + code; it is very accurate.*/ + /*z is the fractional part of the log in Q62 format. + We need 1 bit of headroom since the magnitude can get larger than 1 + during the iteration, and a sign bit.*/ + z <<= 5; + /*w is the exponential in Q61 format (since it also needs headroom and can + get as large as 2.0); we could get another bit if we dropped the sign, + but we'll recover that bit later anyway. + Ideally this should start out as + \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} + but in order to guarantee convergence we have to repeat iterations 4, + 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/ + w = 0x26A3D0E401DD846DLL; + for (i = 0;; i++) { + mask = -(z < 0); + w += ((w >> (i + 1)) + mask) ^ mask; + z -= (OD_ATANH_LOG2[i] + mask) ^ mask; + /*Repeat iteration 4.*/ + if (i >= 3) break; + z *= 2; + } + for (;; i++) { + mask = -(z < 0); + w += ((w >> (i + 1)) + mask) ^ mask; + z -= (OD_ATANH_LOG2[i] + mask) ^ mask; + /*Repeat iteration 13.*/ + if (i >= 12) break; + z *= 2; + } + for (; i < 32; i++) { + mask = -(z < 0); + w += ((w >> (i + 1)) + mask) ^ mask; + z = (z - ((OD_ATANH_LOG2[i] + mask) ^ mask)) * 2; + } + wlo = 0; + /*Skip the remaining iterations unless we really require that much + precision. + We could have bailed out earlier for smaller iparts, but that would + require initializing w from a table, as the limit doesn't converge to + 61-bit precision until n=30.*/ + if (ipart > 30) { + /*For these iterations, we just update the low bits, as the high bits + can't possibly be affected. + OD_ATANH_LOG2 has also converged (it actually did so one iteration + earlier, but that's no reason for an extra special case).*/ + for (;; i++) { + mask = -(z < 0); + wlo += ((w >> i) + mask) ^ mask; + z -= (OD_ATANH_LOG2[31] + mask) ^ mask; + /*Repeat iteration 40.*/ + if (i >= 39) break; + z <<= 1; + } + for (; i < 61; i++) { + mask = -(z < 0); + wlo += ((w >> i) + mask) ^ mask; + z = (z - ((OD_ATANH_LOG2[31] + mask) ^ mask)) << 1; + } + } + w = (w << 1) + wlo; + } else { + w = (int64_t)1 << 62; + } + if (ipart < 62) { + w = ((w >> (61 - ipart)) + 1) >> 1; + } + return w; +} + +/*Computes the binary log of w + input: a 64-bit integer in Q0 (no fraction) + output: a 64-bit log in Q57 */ +static int64_t od_blog64(int64_t w) { + int64_t z; + int ipart; + if (w <= 0) return -1; + ipart = od_ilog64(w) - 1; + if (ipart > 61) { + w >>= ipart - 61; + } else { + w <<= 61 - ipart; + } + z = 0; + if (w & (w - 1)) { + int64_t x; + int64_t y; + int64_t u; + int64_t mask; + int i; + /*C doesn't give us 64x64->128 muls, so we use CORDIC. + This is not particularly fast, but it's not being used in time-critical + code; it is very accurate.*/ + /*z is the fractional part of the log in Q61 format.*/ + /*x and y are the cosh() and sinh(), respectively, in Q61 format. + We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)).*/ + x = w + ((int64_t)1 << 61); + y = w - ((int64_t)1 << 61); + for (i = 0; i < 4; i++) { + mask = -(y < 0); + z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; + u = x >> (i + 1); + x -= ((y >> (i + 1)) + mask) ^ mask; + y -= (u + mask) ^ mask; + } + /*Repeat iteration 4.*/ + for (i--; i < 13; i++) { + mask = -(y < 0); + z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; + u = x >> (i + 1); + x -= ((y >> (i + 1)) + mask) ^ mask; + y -= (u + mask) ^ mask; + } + /*Repeat iteration 13.*/ + for (i--; i < 32; i++) { + mask = -(y < 0); + z += ((OD_ATANH_LOG2[i] >> i) + mask) ^ mask; + u = x >> (i + 1); + x -= ((y >> (i + 1)) + mask) ^ mask; + y -= (u + mask) ^ mask; + } + /*OD_ATANH_LOG2 has converged.*/ + for (; i < 40; i++) { + mask = -(y < 0); + z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; + u = x >> (i + 1); + x -= ((y >> (i + 1)) + mask) ^ mask; + y -= (u + mask) ^ mask; + } + /*Repeat iteration 40.*/ + for (i--; i < 62; i++) { + mask = -(y < 0); + z += ((OD_ATANH_LOG2[31] >> i) + mask) ^ mask; + u = x >> (i + 1); + x -= ((y >> (i + 1)) + mask) ^ mask; + y -= (u + mask) ^ mask; + } + z = (z + 8) >> 4; + } + return OD_Q57(ipart) + z; +} + +/*Convenience function converts Q57 value to a clamped 32-bit Q24 value + in: input in Q57 format. + Return: same number in Q24 */ +static int32_t od_q57_to_q24(int64_t in) { + int64_t ret; + ret = (in + ((int64_t)1 << 32)) >> 33; + /*0x80000000 is automatically converted to unsigned on 32-bit systems. + -0x7FFFFFFF-1 is needed to avoid "promoting" the whole expression to + unsigned.*/ + return (int32_t)OD_CLAMPI(-0x7FFFFFFF - 1, ret, 0x7FFFFFFF); +} + +/*Binary exponential of log_scale with 24-bit fractional precision and + saturation. + log_scale: A binary logarithm in Q57 format. + Return: The binary exponential in Q24 format, saturated to 2**31-1 if + log_scale was too large.*/ +static int32_t od_bexp64_q24(int64_t log_scale) { + if (log_scale < OD_Q57(8)) { + int64_t ret; + ret = od_bexp64(log_scale + OD_Q57(24)); + return ret < 0x7FFFFFFF ? (int32_t)ret : 0x7FFFFFFF; + } + return 0x7FFFFFFF; +} + +/*Re-initialize Bessel filter coefficients with the specified delay. + This does not alter the x/y state, but changes the reaction time of the + filter. + Altering the time constant of a reactive filter without alterning internal + state is something that has to be done carefuly, but our design operates at + high enough delays and with small enough time constant changes to make it + safe.*/ +static void od_iir_bessel2_reinit(od_iir_bessel2 *f, int delay) { + int alpha; + int64_t one48; + int64_t warp; + int64_t k1; + int64_t k2; + int64_t d; + int64_t a; + int64_t ik2; + int64_t b1; + int64_t b2; + /*This borrows some code from an unreleased version of Postfish. + See the recipe at http://unicorn.us.com/alex/2polefilters.html for details + on deriving the filter coefficients.*/ + /*alpha is Q24*/ + alpha = (1 << 24) / delay; + one48 = (int64_t)1 << 48; + /*warp is 7.12*/ + warp = OD_MAXI(od_warp_alpha(alpha), 1); + /*k1 is 9.12*/ + k1 = 3 * warp; + /*k2 is 16.24.*/ + k2 = k1 * warp; + /*d is 16.15.*/ + d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9; + /*a is 0.32, since d is larger than both 1.0 and k2.*/ + a = (k2 << 23) / d; + /*ik2 is 25.24.*/ + ik2 = one48 / k2; + /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/ + b1 = 2 * a * (ik2 - (1 << 24)); + /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/ + b2 = (one48 << 8) - ((4 * a) << 24) - b1; + /*All of the filter parameters are Q24.*/ + f->c[0] = (int32_t)((b1 + ((int64_t)1 << 31)) >> 32); + f->c[1] = (int32_t)((b2 + ((int64_t)1 << 31)) >> 32); + f->g = (int32_t)((a + 128) >> 8); +} + +/*Initialize a 2nd order low-pass Bessel filter with the corresponding delay + and initial value. + value is Q24.*/ +static void od_iir_bessel2_init(od_iir_bessel2 *f, int delay, int32_t value) { + od_iir_bessel2_reinit(f, delay); + f->y[1] = f->y[0] = f->x[1] = f->x[0] = value; +} + +static int64_t od_iir_bessel2_update(od_iir_bessel2 *f, int32_t x) { + int64_t c0; + int64_t c1; + int64_t g; + int64_t x0; + int64_t x1; + int64_t y0; + int64_t y1; + int64_t ya; + c0 = f->c[0]; + c1 = f->c[1]; + g = f->g; + x0 = f->x[0]; + x1 = f->x[1]; + y0 = f->y[0]; + y1 = f->y[1]; + ya = ((x + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1 << 23)) >> 24; + f->x[1] = (int32_t)x0; + f->x[0] = x; + f->y[1] = (int32_t)y0; + f->y[0] = (int32_t)ya; + return ya; +} + +static void od_enc_rc_reset(od_rc_state *rc) { + int64_t npixels; + int64_t ibpp; + rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); + /*Insane framerates or frame sizes mean insane bitrates. + Let's not get carried away.*/ + if (rc->bits_per_frame > 0x400000000000LL) { + rc->bits_per_frame = (int64_t)0x400000000000LL; + } else { + if (rc->bits_per_frame < 32) { + rc->bits_per_frame = 32; + } + } + rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); + rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; + /*Start with a buffer fullness and fullness target of 50% */ + rc->reservoir_target = (rc->reservoir_max + 1) >> 1; + rc->reservoir_fullness = rc->reservoir_target; + /*Pick exponents and initial scales for quantizer selection.*/ + npixels = rc->frame_width * (int64_t)rc->frame_height; + rc->log_npixels = od_blog64(npixels); + ibpp = npixels / rc->bits_per_frame; + /*All of these initial scale/exp values are from Theora, and have not yet + been adapted to Daala, so they're certainly wrong. + The B-frame values especially are simply copies of the P-frame values.*/ + if (ibpp < 1) { + rc->exp[OD_I_FRAME] = 59; + rc->log_scale[OD_I_FRAME] = od_blog64(1997) - OD_Q57(OD_COEFF_SHIFT); + } else if (ibpp < 2) { + rc->exp[OD_I_FRAME] = 55; + rc->log_scale[OD_I_FRAME] = od_blog64(1604) - OD_Q57(OD_COEFF_SHIFT); + } else { + rc->exp[OD_I_FRAME] = 48; + rc->log_scale[OD_I_FRAME] = od_blog64(834) - OD_Q57(OD_COEFF_SHIFT); + } + if (ibpp < 4) { + rc->exp[OD_P_FRAME] = 100; + rc->log_scale[OD_P_FRAME] = od_blog64(2249) - OD_Q57(OD_COEFF_SHIFT); + } else if (ibpp < 8) { + rc->exp[OD_P_FRAME] = 95; + rc->log_scale[OD_P_FRAME] = od_blog64(1751) - OD_Q57(OD_COEFF_SHIFT); + } else { + rc->exp[OD_P_FRAME] = 73; + rc->log_scale[OD_P_FRAME] = od_blog64(1260) - OD_Q57(OD_COEFF_SHIFT); + } + /*Golden P-frames both use the same log_scale and exp modeling + values as regular P-frames and the same scale follower. + For convenience in the rate calculation code, we maintain a copy of + the scale and exp values in OD_GOLDEN_P_FRAME.*/ + rc->exp[OD_GOLDEN_P_FRAME] = rc->exp[OD_P_FRAME]; + rc->log_scale[OD_GOLDEN_P_FRAME] = rc->log_scale[OD_P_FRAME]; + rc->exp[OD_ALTREF_P_FRAME] = rc->exp[OD_P_FRAME]; + rc->log_scale[OD_ALTREF_P_FRAME] = rc->log_scale[OD_P_FRAME]; + /*We clamp the actual I and B frame delays to a minimum of 10 to work within + the range of values where later incrementing the delay works as designed. + 10 is not an exact choice, but rather a good working trade-off.*/ + rc->inter_p_delay = 10; + rc->inter_delay_target = rc->reservoir_frame_delay >> 1; + memset(rc->frame_count, 0, sizeof(rc->frame_count)); + /*Drop-frame tracking is concerned with more than just the basic three frame + types. + It needs to track boosted and cut subtypes (of which there is only one + right now, OD_GOLDEN_P_FRAME). */ + rc->prev_drop_count[OD_I_FRAME] = 0; + rc->log_drop_scale[OD_I_FRAME] = OD_Q57(0); + rc->prev_drop_count[OD_P_FRAME] = 0; + rc->log_drop_scale[OD_P_FRAME] = OD_Q57(0); + rc->prev_drop_count[OD_GOLDEN_P_FRAME] = 0; + rc->log_drop_scale[OD_GOLDEN_P_FRAME] = OD_Q57(0); + rc->prev_drop_count[OD_ALTREF_P_FRAME] = 0; + rc->log_drop_scale[OD_ALTREF_P_FRAME] = OD_Q57(0); + /*Set up second order followers, initialized according to corresponding + time constants.*/ + od_iir_bessel2_init(&rc->scalefilter[OD_I_FRAME], 4, + od_q57_to_q24(rc->log_scale[OD_I_FRAME])); + od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], rc->inter_p_delay, + od_q57_to_q24(rc->log_scale[OD_P_FRAME])); + od_iir_bessel2_init(&rc->vfrfilter[OD_I_FRAME], 4, + od_bexp64_q24(rc->log_drop_scale[OD_I_FRAME])); + od_iir_bessel2_init(&rc->vfrfilter[OD_P_FRAME], 4, + od_bexp64_q24(rc->log_drop_scale[OD_P_FRAME])); + od_iir_bessel2_init(&rc->vfrfilter[OD_GOLDEN_P_FRAME], 4, + od_bexp64_q24(rc->log_drop_scale[OD_GOLDEN_P_FRAME])); + od_iir_bessel2_init(&rc->vfrfilter[OD_ALTREF_P_FRAME], 4, + od_bexp64_q24(rc->log_drop_scale[OD_ALTREF_P_FRAME])); +} + +int od_enc_rc_resize(od_rc_state *rc) { + /*If encoding has not yet begun, reset the buffer state.*/ + if (rc->cur_frame == 0) { + od_enc_rc_reset(rc); + } else { + int idt; + /*Otherwise, update the bounds on the buffer, but not the current + fullness.*/ + rc->bits_per_frame = (int64_t)(rc->target_bitrate / rc->framerate); + /*Insane framerates or frame sizes mean insane bitrates. + Let's not get carried away.*/ + if (rc->bits_per_frame > 0x400000000000LL) { + rc->bits_per_frame = (int64_t)0x400000000000LL; + } else { + if (rc->bits_per_frame < 32) { + rc->bits_per_frame = 32; + } + } + rc->reservoir_frame_delay = OD_MAXI(rc->reservoir_frame_delay, 12); + rc->reservoir_max = rc->bits_per_frame * rc->reservoir_frame_delay; + rc->reservoir_target = + ((rc->reservoir_max + 1) >> 1) + + ((rc->bits_per_frame + 2) >> 2) * + OD_MINI(rc->keyframe_rate, rc->reservoir_frame_delay); + /*Update the INTER-frame scale filter delay. + We jump to it immediately if we've already seen enough frames; otherwise + it is simply set as the new target.*/ + rc->inter_delay_target = idt = OD_MAXI(rc->reservoir_frame_delay >> 1, 10); + if (idt < OD_MINI(rc->inter_p_delay, rc->frame_count[OD_P_FRAME])) { + od_iir_bessel2_init(&rc->scalefilter[OD_P_FRAME], idt, + rc->scalefilter[OD_P_FRAME].y[0]); + rc->inter_p_delay = idt; + } + } + return 0; +} + +int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms) { + if (rc->framerate <= 0) return 1; + if (rc->target_bitrate > 0) { + /*State has already been initialized; rather than reinitialize, + adjust the buffering for the new target rate. */ + rc->target_bitrate = bitrate; + return od_enc_rc_resize(rc); + } + rc->target_quantizer = 0; + rc->target_bitrate = bitrate; + rc->rate_bias = 0; + if (bitrate > 0) { + /* The buffer size is clamped between [12, 256], this interval is short + enough to + allow reaction, but long enough to allow looking into the next GOP + (avoiding + the case where the last frames before an I-frame get starved). + The 12 frame minimum gives us some chance to distribute bit estimation + errors in the worst case. The 256 frame maximum means we'll require 8-10 + seconds + of pre-buffering at 24-30 fps, which is not unreasonable.*/ + rc->reservoir_frame_delay = + (int)OD_MINI((delay_ms / 1000) * rc->framerate, 256); + rc->drop_frames = 1; + rc->cap_overflow = 1; + rc->cap_underflow = 0; + rc->twopass_state = 0; + od_enc_rc_reset(rc); + } + return 0; +} + +/*Scale the number of frames by the number of expected drops/duplicates.*/ +static int od_rc_scale_drop(od_rc_state *rc, int frame_type, int nframes) { + if (rc->prev_drop_count[frame_type] > 0 || + rc->log_drop_scale[frame_type] > OD_Q57(0)) { + int64_t dup_scale; + dup_scale = od_bexp64(((rc->log_drop_scale[frame_type] + + od_blog64(rc->prev_drop_count[frame_type] + 1)) >> + 1) + + OD_Q57(8)); + if (dup_scale < nframes << 8) { + int dup_scalei; + dup_scalei = (int)dup_scale; + if (dup_scalei > 0) { + nframes = ((nframes << 8) + dup_scalei - 1) / dup_scalei; + } + } else { + nframes = !!nframes; + } + } + return nframes; +} + +/*Closed form version of frame determination code. + Used by rate control to predict frame types and subtypes into the future. + No side effects, may be called any number of times. + Note that it ignores end-of-file conditions; one-pass planning *should* + ignore end-of-file. */ +int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, + int *is_altref, int64_t *ip_count) { + int frame_type; + if (coding_frame_count == 0) { + *is_golden = 1; + *is_altref = 1; + *ip_count = 0; + frame_type = OD_I_FRAME; + } else { + int keyrate = rc->keyframe_rate; + if (rc->closed_gop) { + int ip_per_gop; + int gop_n; + int gop_i; + ip_per_gop = (keyrate - 1) / 2; + gop_n = coding_frame_count / keyrate; + gop_i = coding_frame_count - gop_n * keyrate; + *ip_count = gop_n * ip_per_gop + (gop_i > 0) + (gop_i - 1); + frame_type = gop_i == 0 ? OD_I_FRAME : OD_P_FRAME; + } else { + int ip_per_gop; + int gop_n; + int gop_i; + ip_per_gop = (keyrate); + gop_n = (coding_frame_count - 1) / keyrate; + gop_i = coding_frame_count - gop_n * keyrate - 1; + *ip_count = (coding_frame_count > 0) + gop_n * ip_per_gop + (gop_i); + frame_type = gop_i / 1 < ip_per_gop - 1 ? OD_P_FRAME : OD_I_FRAME; + } + } + *is_golden = + (*ip_count % rc->goldenframe_rate) == 0 || frame_type == OD_I_FRAME; + *is_altref = (*ip_count % rc->altref_rate) == 0 || frame_type == OD_I_FRAME; + return frame_type; +} + +/*Count frames types forward from the current frame up to but not including + the last I-frame in reservoir_frame_delay. + If reservoir_frame_delay contains no I-frames (or the current frame is the + only I-frame), count all reservoir_frame_delay frames. + Returns the number of frames counted. + Right now, this implementation is simple, brute-force, and expensive. + It is also easy to understand and debug. + TODO: replace with a virtual FIFO that keeps running totals as + repeating the counting over-and-over will have a performance impact on + whole-file 2pass usage.*/ +static int frame_type_count(od_rc_state *rc, int nframes[OD_FRAME_NSUBTYPES]) { + int i; + int j; + int acc[OD_FRAME_NSUBTYPES]; + int count; + int reservoir_frames; + int reservoir_frame_delay; + memset(nframes, 0, OD_FRAME_NSUBTYPES * sizeof(*nframes)); + memset(acc, 0, sizeof(acc)); + count = 0; + reservoir_frames = 0; +#if 1 + /*Go ahead and count past end-of-stream. + We won't nail the exact bitrate on short files that end with a partial + GOP, but we also won't [potentially] destroy the quality of the last few + frames in that same case when we suddenly find out the stream is ending + before the original planning horizon.*/ + reservoir_frame_delay = rc->reservoir_frame_delay; +#else + /*Don't count past the end of the stream (once we know where end-of-stream + is).*/ + reservoir_frame_delay = + rc->end_of_input ? rc->input_size + 1 : rc->reservoir_frame_delay; +#endif + for (i = 0; i < reservoir_frame_delay; i++) { + int frame_type; + int is_golden; + int is_altref; + int64_t dummy; + frame_type = + od_frame_type(rc, rc->cur_frame + i, &is_golden, &is_altref, &dummy); + switch (frame_type) { + case OD_I_FRAME: { + for (j = 0; j < OD_FRAME_NSUBTYPES; j++) nframes[j] += acc[j]; + reservoir_frames += count; + memset(acc, 0, sizeof(acc)); + acc[OD_I_FRAME] = 1; + count = 1; + break; + } + case OD_P_FRAME: { + if (is_golden) { + ++acc[OD_GOLDEN_P_FRAME]; + ++count; + } else if (is_altref) { + ++acc[OD_ALTREF_P_FRAME]; + ++count; + } else { + ++acc[OD_P_FRAME]; + ++count; + } + break; + } + } + } + /*If there were no I-frames at all, or only the first frame was an I-frame, + the accumulators never flushed and still contain the counts for the + entire buffer. + In both these cases, we return these counts. + Otherwise, we discard what remains in the accumulators as they contain + the counts from and past the last I-frame.*/ + if (reservoir_frames == 0) { + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) nframes[i] = acc[i]; + reservoir_frames += count; + } + return reservoir_frames; +} + +static int convert_to_ac_quant(int q, int bit_depth) { + return lrint(av1_convert_qindex_to_q(q, bit_depth)); +} + +int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, + int is_golden_frame, + int is_altref_frame, int frame_type, + int *bottom_idx, int *top_idx) { + int frame_subtype; + int64_t log_cur_scale; + int lossy_quantizer_min; + int lossy_quantizer_max; + double mqp_i = OD_MQP_I; + double mqp_p = OD_MQP_P; + double mqp_gp = OD_MQP_GP; + double mqp_ap = OD_MQP_AP; + int reservoir_frames; + int nframes[OD_FRAME_NSUBTYPES]; + int32_t mqp_Q12[OD_FRAME_NSUBTYPES]; + int64_t dqp_Q45[OD_FRAME_NSUBTYPES]; + /*Verify the closed-form frame type determination code matches what the + input queue set.*/ + /*One pseudo-non-closed-form caveat: + Once we've seen end-of-input, the batched frame determination code + suppresses the last open-GOP's I-frame (since it would only be + useful for the next GOP, which doesn't exist). + Thus, don't check one the input queue is drained.*/ + if (!rc->end_of_input) { + int closed_form_type; + int closed_form_golden; + int closed_form_altref; + int64_t closed_form_cur_frame; + closed_form_type = + od_frame_type(rc, rc->cur_frame, &closed_form_golden, + &closed_form_altref, &closed_form_cur_frame); + OD_UNUSED(closed_form_type); + OD_UNUSED(is_altref_frame); + assert(closed_form_type == frame_type); + assert(closed_form_cur_frame == rc->cur_frame); + assert(closed_form_altref == is_altref_frame); + assert(closed_form_golden == is_golden_frame); + } + + log_cur_scale = (int64_t)rc->scalefilter[frame_type].y[0] << 33; + + /*Count the various types and classes of frames.*/ + reservoir_frames = frame_type_count(rc, nframes); + nframes[OD_I_FRAME] = od_rc_scale_drop(rc, OD_I_FRAME, nframes[OD_I_FRAME]); + nframes[OD_P_FRAME] = od_rc_scale_drop(rc, OD_P_FRAME, nframes[OD_P_FRAME]); + nframes[OD_GOLDEN_P_FRAME] = + od_rc_scale_drop(rc, OD_GOLDEN_P_FRAME, nframes[OD_GOLDEN_P_FRAME]); + nframes[OD_ALTREF_P_FRAME] = + od_rc_scale_drop(rc, OD_ALTREF_P_FRAME, nframes[OD_ALTREF_P_FRAME]); + + switch (rc->twopass_state) { + default: break; + case 1: { + /*Pass 1 mode: use a fixed qi value.*/ + return rc->firstpass_quant; + } break; + case 2: { + int i; + int64_t scale_sum[OD_FRAME_NSUBTYPES]; + int qti; + /*Pass 2 mode: we know exactly how much of each frame type there is in + the current buffer window, and have estimates for the scales.*/ + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { + nframes[i] = rc->nframes[i]; + nframes[i] = rc->nframes[i]; + scale_sum[i] = rc->scale_sum[i]; + } + /*If we're not using the same frame type as in pass 1 (because someone + changed the keyframe interval), remove that scale estimate. + We'll add in a replacement for the correct frame type below.*/ + qti = rc->cur_metrics.frame_type; + if (qti != frame_type) { + nframes[qti]--; + scale_sum[qti] -= od_bexp64_q24(rc->cur_metrics.log_scale); + } + /*Compute log_scale estimates for each frame type from the pass-1 scales + we measured in the current window.*/ + for (qti = 0; qti < OD_FRAME_NSUBTYPES; qti++) { + rc->log_scale[qti] = nframes[qti] > 0 + ? od_blog64(scale_sum[qti]) - + od_blog64(nframes[qti]) - OD_Q57(24) + : -rc->log_npixels; + } + /*If we're not using the same frame type as in pass 1, add a scale + estimate for the corresponding frame using the current low-pass + filter value. + This is mostly to ensure we have a valid estimate even when pass 1 had + no frames of this type in the buffer window. + TODO: We could also plan ahead and figure out how many keyframes we'll + be forced to add in the current buffer window.*/ + qti = rc->cur_metrics.frame_type; + if (qti != frame_type) { + int64_t scale; + scale = rc->log_scale[frame_type] < OD_Q57(23) + ? od_bexp64(rc->log_scale[frame_type] + OD_Q57(24)) + : 0x7FFFFFFFFFFFLL; + scale *= nframes[frame_type]; + nframes[frame_type]++; + scale += od_bexp64_q24(log_cur_scale >> 33); + rc->log_scale[frame_type] = + od_blog64(scale) - od_blog64(nframes[qti]) - OD_Q57(24); + } else { + log_cur_scale = (int64_t)rc->cur_metrics.log_scale << 33; + } + } break; + } + + /*Quantizer selection sticks to the codable, lossy portion of the quantizer + range.*/ + lossy_quantizer_min = convert_to_ac_quant(rc->minq, rc->bit_depth); + lossy_quantizer_max = convert_to_ac_quant(rc->maxq, rc->bit_depth); + frame_subtype = frame_type; + /*Stash quantizer modulation by frame type.*/ + mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); + mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); + mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); + mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); + dqp_Q45[OD_I_FRAME] = OD_F_Q45(OD_DQP_I); + dqp_Q45[OD_P_FRAME] = OD_F_Q45(OD_DQP_P); + dqp_Q45[OD_GOLDEN_P_FRAME] = OD_F_Q45(OD_DQP_GP); + dqp_Q45[OD_ALTREF_P_FRAME] = OD_F_Q45(OD_DQP_AP); + /*Is rate control active?*/ + if (rc->target_bitrate <= 0) { + /*Rate control is not active; derive quantizer directly from + quality parameter and frame type. */ + /*Can't use the OD_LOSSLESS macro, as it uses state.quantizer to intuit, + and we've not set it yet.*/ + if (rc->quality == 0) { + /*Lossless coding requested.*/ + rc->base_quantizer = 0; + rc->target_quantizer = 0; + } else { + int64_t log_quantizer; + + /* Adjust the modulation constants using the last frame's quantizer. */ + double mqp_delta = (255 - rc->target_quantizer) / 2000.0f; + mqp_i -= mqp_delta; + mqp_p += mqp_delta; + mqp_gp -= mqp_delta; + mqp_Q12[OD_I_FRAME] = OD_F_Q12(mqp_i); + mqp_Q12[OD_P_FRAME] = OD_F_Q12(mqp_p); + mqp_Q12[OD_GOLDEN_P_FRAME] = OD_F_Q12(mqp_gp); + mqp_Q12[OD_ALTREF_P_FRAME] = OD_F_Q12(mqp_ap); + + if (rc->quality == -1) { + /*A quality of -1 means quality was unset; use a default.*/ + rc->base_quantizer = convert_to_ac_quant(10, rc->bit_depth); + } else { + rc->base_quantizer = convert_to_ac_quant(rc->quality, rc->bit_depth); + } + + if (rc->periodic_boosts && !is_golden_frame) { + int pattern_rate = (rc->goldenframe_rate >> 1); + int dist_to_golden = rc->cur_frame % pattern_rate; + int dist_away_golden = pattern_rate - dist_to_golden; + int boost = dist_to_golden; + if (dist_away_golden > dist_to_golden) boost = dist_away_golden; + boost -= pattern_rate; + boost *= (rc->base_quantizer) / OD_PERIODIC_BOOST_DIV; + rc->base_quantizer = rc->base_quantizer + boost; + } + + /*As originally written, qp modulation is applied to the coded quantizer. + Because we now have and use a more precise target quantizer for various + calculation, that needs to be modulated as well. + Calculate what is, effectively, a fractional coded quantizer. */ + /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ + log_quantizer = od_blog64(rc->base_quantizer) - OD_Q57(OD_COEFF_SHIFT); + /*log_quantizer to Q21.*/ + log_quantizer >>= 36; + /*scale log quantizer, result is Q33.*/ + log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; + /*Add Q33 offset to Q33 log_quantizer.*/ + log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; + /*Modulate quantizer according to frame type; result is Q45.*/ + log_quantizer *= mqp_Q12[frame_subtype]; + /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ + log_quantizer += dqp_Q45[frame_subtype]; + /*Back to log2 quantizer in Q57.*/ + log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * + OD_LOG_QUANTIZER_EXP_Q12 + + OD_Q57(OD_COEFF_SHIFT); + /*Convert Q57 log2 quantizer to unclamped linear target quantizer value.*/ + rc->target_quantizer = od_bexp64(log_quantizer); + } + } else { + int clamp; + int64_t rate_bias; + int64_t rate_total; + int base_quantizer; + int64_t log_quantizer; + int qlo; + int qhi; + int i; + /*We clamp the allowed amount of qi change (after initialization).*/ + clamp = rc->cur_frame > 0; + /*Figure out how to re-distribute bits so that we hit our fullness target + before the last keyframe in our current buffer window (after the current + frame), or the end of the buffer window, whichever comes first.*/ + /*Single pass only right now.*/ + /*If we've been missing our target, add a penalty term.*/ + rate_bias = (rc->rate_bias / (rc->cur_frame + 1000)) * reservoir_frames; + /*rate_total is the total bits available over the next + reservoir_frames frames.*/ + rate_total = rc->reservoir_fullness - rc->reservoir_target + rate_bias + + reservoir_frames * rc->bits_per_frame; + /*Find a target quantizer that meets our rate target for the specific mix + of frame types we'll have over the next frame_delay frames. + We model the rate<->quantizer relationship as: + rate = scale*(quantizer**-exp) + In this case, we have our desired rate, an exponent selected in setup, + and a scale that's been measured over our frame history, so we're + solving for the quantizer. + Exponentiation with arbitrary exponents is expensive, so we work in + the binary log domain (binary exp and log aren't too bad): + rate = e2(log2_scale - log2_quantizer * exp) + There's no easy closed form solution, so we bisection search for it.*/ + /*We do not currently allow rate control to select lossless encoding.*/ + qlo = 1; + /*If there's a quality specified, it's used to select the + coarsest base quantizer we can select. + Otherwise we can use up to and including the coarsest codable + quantizer.*/ + if (rc->quality > 0) + qhi = convert_to_ac_quant(rc->quality, rc->bit_depth); + else + qhi = lossy_quantizer_max; + base_quantizer = (qlo + qhi) >> 1; + while (qlo < qhi) { + volatile int64_t log_base_quantizer; + int64_t diff; + int64_t bits; + /*Count bits contributed by each frame type using the model.*/ + bits = 0; + log_base_quantizer = od_blog64(base_quantizer); + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { + /*Modulate base quantizer by frame type.*/ + /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ + log_quantizer = log_base_quantizer - OD_Q57(OD_COEFF_SHIFT); + /*log_quantizer to Q21.*/ + log_quantizer >>= 36; + /*scale log quantizer, result is Q33.*/ + log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; + /*Add Q33 offset to Q33 log_quantizer.*/ + log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; + /*Modulate quantizer according to frame type; result is Q45.*/ + log_quantizer *= mqp_Q12[i]; + /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ + log_quantizer += dqp_Q45[i]; + /*Back to log2 quantizer in Q57.*/ + log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * + OD_LOG_QUANTIZER_EXP_Q12 + + OD_Q57(OD_COEFF_SHIFT); + /*Clamp modulated quantizer values.*/ + log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, + od_blog64(lossy_quantizer_max)); + /* All the fields here are Q57 except for the exponent which is Q6.*/ + bits += nframes[i] * od_bexp64(rc->log_scale[i] + rc->log_npixels - + (log_quantizer >> 6) * rc->exp[i]); + } + diff = bits - rate_total; + if (diff > 0) { + qlo = base_quantizer + 1; + } else if (diff < 0) { + qhi = base_quantizer - 1; + } else { + break; + } + base_quantizer = (qlo + qhi) >> 1; + } + /*If this was not one of the initial frames, limit the change in base + quantizer to within [0.8*Q,1.2*Q], where Q is the previous frame's + base quantizer.*/ + if (clamp) { + base_quantizer = OD_CLAMPI((rc->base_quantizer * 0x0CCCD + 0x8000) >> 16, + base_quantizer, + (rc->base_quantizer * 0x13333 + 0x8000) >> 16); + } + /*Modulate chosen base quantizer to produce target quantizer.*/ + log_quantizer = od_blog64(base_quantizer); + /*Get the log2 quantizer in Q57 (normalized for coefficient shift).*/ + log_quantizer -= OD_Q57(OD_COEFF_SHIFT); + /*log_quantizer to Q21.*/ + log_quantizer >>= 36; + /*scale log quantizer, result is Q33.*/ + log_quantizer *= OD_LOG_QUANTIZER_BASE_Q12; + /*Add Q33 offset to Q33 log_quantizer.*/ + log_quantizer += OD_LOG_QUANTIZER_OFFSET_Q45 >> 12; + /*Modulate quantizer according to frame type; result is Q45.*/ + log_quantizer *= mqp_Q12[frame_subtype]; + /*Add Q45 boost/cut to Q45 fractional coded quantizer.*/ + log_quantizer += dqp_Q45[frame_subtype]; + /*Back to log2 quantizer in Q57.*/ + log_quantizer = (log_quantizer - OD_LOG_QUANTIZER_OFFSET_Q45) * + OD_LOG_QUANTIZER_EXP_Q12 + + OD_Q57(OD_COEFF_SHIFT); + /*Clamp modulated quantizer values.*/ + log_quantizer = OD_CLAMPI(od_blog64(lossy_quantizer_min), log_quantizer, + od_blog64(lossy_quantizer_max)); + /*The above allocation looks only at the total rate we'll accumulate in + the next reservoir_frame_delay frames. + However we could overflow the bit reservoir on the very next frame, so + check for that here if we're not using a soft target.*/ + if (rc->cap_overflow) { + int64_t margin; + int64_t soft_limit; + int64_t log_soft_limit; + int64_t log_scale_pixels; + int64_t exp; + int64_t log_qexp; + /*Allow 3% of the buffer for prediction error. + This should be plenty, and we don't mind if we go a bit over; we only + want to keep these bits from being completely wasted.*/ + margin = (rc->reservoir_max + 31) >> 5; + /*We want to use at least this many bits next frame.*/ + soft_limit = rc->reservoir_fullness + rc->bits_per_frame - + (rc->reservoir_max - margin); + log_soft_limit = od_blog64(soft_limit); + /*If we're predicting we won't use that many bits...*/ + log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; + exp = rc->exp[frame_subtype]; + log_qexp = (log_quantizer >> 6) * exp; + if (log_scale_pixels - log_qexp < log_soft_limit) { + /*Scale the adjustment based on how far into the margin we are.*/ + log_qexp += ((log_scale_pixels - log_soft_limit - log_qexp) >> 32) * + (OD_MINI(margin, soft_limit) << 32) / margin; + log_quantizer = (((log_qexp + (exp >> 1)) / exp) << 6); + } + } + /*We just checked we don't overflow the reservoir next frame, now check + we don't underflow and bust the budget (when not using a soft target). + Disabled when a quality bound is set; if we saturate quantizer to the + maximum possible size when we have a limiting max quality, the + resulting lambda can cause strange behavior.*/ + if (rc->quality == -1) { + int64_t exp; + int64_t log_qexp; + int64_t log_scale_pixels; + int64_t log_hard_limit; + /*Compute the maximum number of bits we can use in the next frame. + Allow 50% of the rate for a single frame for prediction error. + This may not be enough for keyframes or sudden changes in + complexity.*/ + log_hard_limit = + od_blog64(rc->reservoir_fullness + (rc->bits_per_frame >> 1)); + /*If we're predicting we'll use more than this...*/ + log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; + exp = rc->exp[frame_subtype]; + log_qexp = (log_quantizer >> 6) * exp; + if (log_scale_pixels - log_qexp > log_hard_limit) { + /*Force the target to hit our limit exactly.*/ + log_qexp = log_scale_pixels - log_hard_limit; + log_quantizer = (log_qexp + (exp >> 1)) / exp << 6; + /*If that target is unreasonable, oh well; we'll have to drop.*/ + log_quantizer = OD_MAXI(log_quantizer, od_blog64(lossy_quantizer_max)); + } + } + /*Compute a final estimate of the number of bits we plan to use, update + the running rate bias measurement.*/ + { + int64_t log_qexp; + int64_t log_scale_pixels; + log_scale_pixels = rc->log_scale[frame_subtype] + rc->log_npixels; + log_qexp = (log_quantizer >> 6) * rc->exp[frame_subtype]; + rc->rate_bias += od_bexp64(log_scale_pixels - log_qexp); + } + rc->target_quantizer = od_bexp64(log_quantizer); + /*The various cappings and adjustments may have altered the log_quantizer + target significantly. + We can either update the base quantizer to be consistent with the + target or let it track separately. + Theora behavior effectively keeps them consistent, as it regenerates + the effective base quantizer from the target each frame rather than + saving both. + For Daala, it's easier to allow them to track separately. + For now, allow them to track separately and see how it behaves.*/ + rc->base_quantizer = base_quantizer; + } + *bottom_idx = lossy_quantizer_min; + *top_idx = lossy_quantizer_max; + rc->target_quantizer = av1_qindex_from_ac( + OD_CLAMPI(lossy_quantizer_min, rc->target_quantizer, lossy_quantizer_max), + rc->bit_depth); + return rc->target_quantizer; +} + +int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, + int is_altref_frame, int frame_type, int droppable) { + int dropped; + dropped = 0; + /*Update rate control only if rate control is active.*/ + if (rc->target_bitrate > 0) { + int64_t log_scale; + int frame_subtype; + frame_subtype = frame_type; + /*Track non-golden and golden P frame drops separately.*/ + if (is_golden_frame && frame_type == OD_P_FRAME) + frame_subtype = OD_GOLDEN_P_FRAME; + else if (is_altref_frame && frame_type == OD_P_FRAME) + frame_subtype = OD_ALTREF_P_FRAME; + if (bits <= 0) { + /*We didn't code any blocks in this frame.*/ + log_scale = OD_Q57(-64); + bits = 0; + ++rc->prev_drop_count[frame_subtype]; + } else { + int64_t log_bits; + int64_t log_qexp; + /*Compute the estimated scale factor for this frame type.*/ + log_bits = od_blog64(bits); + log_qexp = od_blog64(rc->target_quantizer); + log_qexp = (log_qexp >> 6) * (rc->exp[frame_type]); + log_scale = OD_MINI(log_bits - rc->log_npixels + log_qexp, OD_Q57(16)); + } + + switch (rc->twopass_state) { + case 1: { + int golden, altref; + int64_t ipc; + rc->cur_metrics.frame_type = + od_frame_type(rc, rc->cur_frame, &golden, &altref, &ipc); + /*Pass 1 mode: save the metrics for this frame.*/ + rc->cur_metrics.log_scale = od_q57_to_q24(log_scale); + } break; + case 2: { + /*Pass 2 mode:*/ + int m_frame_type = rc->cur_metrics.frame_type; + rc->nframes[m_frame_type]--; + rc->scale_sum[m_frame_type] -= od_bexp64_q24(rc->cur_metrics.log_scale); + } break; + } + + if (bits > 0) { + od_iir_bessel2 *f; + /*If this is the first example of the given frame type we've + seen, we immediately replace the default scale factor guess + with the estimate we just computed using the first frame.*/ + if (rc->frame_count[frame_type] == 0) { + f = rc->scalefilter + frame_type; + f->y[1] = f->y[0] = f->x[1] = f->x[0] = od_q57_to_q24(log_scale); + rc->log_scale[frame_type] = log_scale; + } else { + /*Lengthen the time constant for the inter filters as we collect more + frame statistics, until we reach our target.*/ + if (frame_type != OD_I_FRAME && + rc->inter_p_delay < rc->inter_delay_target && + rc->frame_count[frame_type] >= rc->inter_p_delay) { + od_iir_bessel2_reinit(&rc->scalefilter[frame_type], + ++rc->inter_p_delay); + } + /*Update the low-pass scale filter for this frame type + regardless of whether or not we drop this frame.*/ + rc->log_scale[frame_type] = + od_iir_bessel2_update(rc->scalefilter + frame_type, + od_q57_to_q24(log_scale)) + << 33; + } + /*If this frame busts our budget, it must be dropped.*/ + if (droppable && rc->reservoir_fullness + rc->bits_per_frame < bits) { + ++rc->prev_drop_count[frame_subtype]; + bits = 0; + dropped = 1; + } else { + uint32_t drop_count; + /*Update a low-pass filter to estimate the "real" frame rate taking + drops into account. + This is only done if the frame is coded, as it needs the final + count of dropped frames.*/ + drop_count = rc->prev_drop_count[frame_subtype] + 1; + if (drop_count > 0x7F) { + drop_count = 0x7FFFFFFF; + } else { + drop_count <<= 24; + } + rc->log_drop_scale[frame_subtype] = + od_blog64(od_iir_bessel2_update(rc->vfrfilter + frame_subtype, + drop_count)) - + OD_Q57(24); + /*Zero the drop count for this frame. + It will be increased if we drop frames.*/ + rc->prev_drop_count[frame_subtype] = 0; + } + /*Increment the frame count for filter adaptation purposes.*/ + if (!rc->twopass_state) rc->frame_count[frame_type]++; + } + rc->reservoir_fullness += rc->bits_per_frame - bits; + /*If we're too quick filling the buffer and overflow is capped, + that rate is lost forever.*/ + if (rc->cap_overflow && rc->reservoir_fullness > rc->reservoir_max) { + rc->reservoir_fullness = rc->reservoir_max; + } + /*If we're too quick draining the buffer and underflow is capped, + don't try to make up that rate later.*/ + if (rc->cap_underflow && rc->reservoir_fullness < 0) { + rc->reservoir_fullness = 0; + } + /*Adjust the bias for the real bits we've used.*/ + rc->rate_bias -= bits; + } + return dropped; +} + +static INLINE void od_rc_buffer_val(od_rc_state *rc, int64_t val, int bytes) { + while (bytes-- > 0) { + rc->twopass_buffer[rc->twopass_buffer_bytes++] = (uint8_t)(val & 0xFF); + val >>= 8; + } +} + +static INLINE int64_t od_rc_unbuffer_val(od_rc_state *rc, int bytes) { + int64_t ret = 0; + int shift = 0; + while (bytes-- > 0) { + ret |= ((int64_t)rc->twopass_buffer[rc->twopass_buffer_bytes++]) << shift; + shift += 8; + } + return ret; +} + +int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, + int summary) { + int i; + struct aom_codec_cx_pkt pkt; + rc->twopass_buffer = rc->firstpass_buffer; + rc->twopass_buffer_bytes = 0; + if (!rc->twopass_state) { + rc->twopass_state = 1; + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { + rc->frame_count[i] = 0; + rc->exp[i] = 0; + rc->scale_sum[i] = 0; + } + } + if (summary) { + od_rc_buffer_val(rc, OD_RC_2PASS_MAGIC, 4); + od_rc_buffer_val(rc, OD_RC_2PASS_VERSION, 1); + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { + od_rc_buffer_val(rc, rc->frame_count[i], 4); + od_rc_buffer_val(rc, rc->exp[i], 4); + od_rc_buffer_val(rc, rc->scale_sum[i], 8); + } + } else { + int frame_type = rc->cur_metrics.frame_type; + rc->scale_sum[frame_type] += od_bexp64_q24(rc->cur_metrics.log_scale); + rc->frame_count[frame_type]++; + od_rc_buffer_val(rc, rc->cur_metrics.frame_type, 1); + od_rc_buffer_val(rc, rc->cur_metrics.log_scale, 4); + } + pkt.data.twopass_stats.buf = rc->firstpass_buffer; + pkt.data.twopass_stats.sz = rc->twopass_buffer_bytes; + pkt.kind = AOM_CODEC_STATS_PKT; + aom_codec_pkt_list_add(pkt_list, &pkt); + return 0; +} + +int od_enc_rc_2pass_in(od_rc_state *rc) { + /* Enable pass 2 mode if this is the first call. */ + if (rc->twopass_state == 0) { + uint32_t i, total_frames = 0; + + if (!rc->twopass_allframes_buf || + rc->twopass_allframes_buf_size < OD_RC_2PASS_MIN) + return -1; + + /* Find summary packet at the end */ + rc->twopass_buffer = rc->twopass_allframes_buf; + rc->twopass_buffer += + rc->twopass_allframes_buf_size - OD_RC_2PASS_SUMMARY_SZ; + rc->twopass_buffer_bytes = 0; + + if (od_rc_unbuffer_val(rc, 4) != OD_RC_2PASS_MAGIC) return -1; + if (od_rc_unbuffer_val(rc, 1) != OD_RC_2PASS_VERSION) return -1; + + for (i = 0; i < OD_FRAME_NSUBTYPES; i++) { + rc->frame_count[i] = od_rc_unbuffer_val(rc, 4); + rc->exp[i] = od_rc_unbuffer_val(rc, 4); + rc->scale_sum[i] = od_rc_unbuffer_val(rc, 8); + rc->nframes[i] = rc->frame_count[i]; + total_frames += rc->frame_count[i]; + } + + if (total_frames < 1) return -1; + + if (total_frames * OD_RC_2PASS_PACKET_SZ > rc->twopass_allframes_buf_size) + return -1; + + od_enc_rc_reset(rc); + + /* Everything looks ok */ + rc->twopass_buffer = rc->twopass_allframes_buf; + rc->twopass_state = 2; + rc->twopass_buffer_bytes = 0; + } + + rc->cur_metrics.frame_type = od_rc_unbuffer_val(rc, 1); + rc->cur_metrics.log_scale = od_rc_unbuffer_val(rc, 4); + + return 0; +} diff --git a/third_party/aom/av1/encoder/ratectrl_xiph.h b/third_party/aom/av1/encoder/ratectrl_xiph.h new file mode 100644 index 000000000..a4a9052fa --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl_xiph.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2001-2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if !defined(_ratectrl_xiph_H) +#define _ratectrl_xiph_H (1) + +#include "av1/encoder/ratectrl.h" +#include "aom/internal/aom_codec_internal.h" + +/*Frame types.*/ +#define OD_I_FRAME (0) +#define OD_P_FRAME (1) +#define OD_GOLDEN_P_FRAME (2) +#define OD_ALTREF_P_FRAME (3) + +#define OD_FRAME_NSUBTYPES (OD_ALTREF_P_FRAME + 1) + +/* Periodic boost (in between golden frames) strength - lower is more */ +#define OD_PERIODIC_BOOST_DIV (10) + +/* Constants for frame QP modulation <- tweak these + * Adjusts how the rate control system decides the quantizers per frame + * (sub)type */ +#define OD_MQP_I (0.98) +#define OD_MQP_P (1.06) +#define OD_MQP_GP (0.99) +#define OD_MQP_AP (0.92) +#define OD_DQP_I (-2) +#define OD_DQP_P (0) +#define OD_DQP_GP (-2) +#define OD_DQP_AP (-2) + +/*Fractional_coded_quantizer ~= + log2(quantizer / (1 << OD_COEFF_SHIFT))*6.307 + 6.235*/ +/*Base/scale factor for linear quantizer to fractional coded quantizer + conversion (6.307 * 2^12) */ +#define OD_LOG_QUANTIZER_BASE_Q12 (0x0064EB) +/*Inverse of above scale factor.*/ +#define OD_LOG_QUANTIZER_EXP_Q12 (0x000289) +/*Offset for linear quantizer to fractional coded quantizer + conversion (6.235 * 2^45) */ +#define OD_LOG_QUANTIZER_OFFSET_Q45 (0x0000C7851EB851ECLL) + +#define OD_RC_2PASS_MAGIC (0x53015641) /* [A, V, 1, S] in little endian */ +#define OD_RC_2PASS_SUMMARY_SZ (4 + 1 + (4 + 4 + 8) * OD_FRAME_NSUBTYPES) +#define OD_RC_2PASS_PACKET_SZ (1 + 4) +#define OD_RC_2PASS_MIN (OD_RC_2PASS_PACKET_SZ + OD_RC_2PASS_SUMMARY_SZ) +#define OD_RC_2PASS_VERSION (1) + +/*A 2nd order low-pass Bessel follower. + We use this for rate control because it has fast reaction time, but is + critically damped.*/ +typedef struct od_iir_bessel2 { + int32_t c[2]; + int64_t g; + int32_t x[2]; + int32_t y[2]; +} od_iir_bessel2; + +/* The 2-pass metrics associated with a single frame. */ +typedef struct od_frame_metrics { + /*The log base 2 of the scale factor for this frame in Q24 format.*/ + int64_t log_scale; + /*The frame type from pass 1.*/ + unsigned frame_type : 1; +} od_frame_metrics; + +/*Rate control setup and working state information.*/ +typedef struct od_rc_state { + /* Image format */ + int frame_width; + int frame_height; + int bit_depth; + + /* Framerate */ + double framerate; + /* Keyframe rate */ + int keyframe_rate; + /* Golden frame period */ + int goldenframe_rate; + /* Altref frame period */ + int altref_rate; + /*The target bit-rate in bits per second.*/ + int64_t target_bitrate; + /* Quality level for non-bitrate-targeting */ + int quality; + /* Copied from oxcf->frame_periodic_boost */ + int periodic_boosts; + /* Max Q */ + int maxq; + /* Min Q */ + int minq; + /* Quantizer to use for the first pass */ + int firstpass_quant; + + /* 2-pass metrics */ + od_frame_metrics cur_metrics; + + /* 2-pass state */ + int64_t scale_sum[OD_FRAME_NSUBTYPES]; + int nframes[OD_FRAME_NSUBTYPES]; + + /* 2-pass bytestream reader/writer context */ + uint8_t *twopass_buffer; + int twopass_buffer_bytes; + + /* Pass 1 stats packet storage */ + uint8_t firstpass_buffer[OD_RC_2PASS_SUMMARY_SZ]; + + /* Every state packet from the first pass in a single buffer */ + uint8_t *twopass_allframes_buf; + size_t twopass_allframes_buf_size; + + /* Actual returned quantizer */ + int target_quantizer; + /*The full-precision, unmodulated quantizer upon which + our modulated quantizers are based.*/ + int base_quantizer; + + /* Increments by 1 for each frame. */ + int64_t cur_frame; + + /* End of input flag */ + int end_of_input; + /* Closed GOP flag */ + int closed_gop; + /*The number of frames over which to distribute the reservoir usage.*/ + int reservoir_frame_delay; + /*Will we drop frames to meet bitrate target?*/ + unsigned char drop_frames; + /*Do we respect the maximum reservoir fullness?*/ + unsigned char cap_overflow; + /*Can the reservoir go negative?*/ + unsigned char cap_underflow; + /*Two-pass mode state. + 0 => 1-pass encoding. + 1 => 1st pass of 2-pass encoding. + 2 => 2nd pass of 2-pass encoding.*/ + int twopass_state; + /*The log of the number of pixels in a frame in Q57 format.*/ + int64_t log_npixels; + /*The target average bits per frame.*/ + int64_t bits_per_frame; + /*The current bit reservoir fullness (bits available to be used).*/ + int64_t reservoir_fullness; + /*The target buffer fullness. + This is where we'd like to be by the last keyframe the appears in the next + buf_delay frames.*/ + int64_t reservoir_target; + /*The maximum buffer fullness (total size of the buffer).*/ + int64_t reservoir_max; + /*The log of estimated scale factor for the rate model in Q57 format.*/ + int64_t log_scale[OD_FRAME_NSUBTYPES]; + /*The exponent used in the rate model in Q8 format.*/ + unsigned exp[OD_FRAME_NSUBTYPES]; + /*The log of an estimated scale factor used to obtain the real framerate, for + VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/ + int64_t log_drop_scale[OD_FRAME_NSUBTYPES]; + /*The total drop count from the previous frame.*/ + uint32_t prev_drop_count[OD_FRAME_NSUBTYPES]; + /*Second-order lowpass filters to track scale and VFR/drops.*/ + od_iir_bessel2 scalefilter[OD_FRAME_NSUBTYPES]; + od_iir_bessel2 vfrfilter[OD_FRAME_NSUBTYPES]; + int frame_count[OD_FRAME_NSUBTYPES]; + int inter_p_delay; + int inter_delay_target; + /*The total accumulated estimation bias.*/ + int64_t rate_bias; +} od_rc_state; + +int od_enc_rc_init(od_rc_state *rc, int64_t bitrate, int delay_ms); + +int od_enc_rc_select_quantizers_and_lambdas(od_rc_state *rc, + int is_golden_frame, + int is_altref_frame, int frame_type, + int *bottom_idx, int *top_idx); + +/* Returns 1 if the frame should be dropped */ +int od_enc_rc_update_state(od_rc_state *rc, int64_t bits, int is_golden_frame, + int is_altref_frame, int frame_type, int droppable); + +int od_frame_type(od_rc_state *rc, int64_t coding_frame_count, int *is_golden, + int *is_altref, int64_t *ip_count); + +int od_enc_rc_resize(od_rc_state *rc); + +int od_enc_rc_2pass_out(od_rc_state *rc, struct aom_codec_pkt_list *pkt_list, + int summary); + +int od_enc_rc_2pass_in(od_rc_state *rc); + +#endif diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c new file mode 100644 index 000000000..f06e569e7 --- /dev/null +++ b/third_party/aom/av1/encoder/rd.c @@ -0,0 +1,1204 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/tokenize.h" + +#define RD_THRESH_POW 1.25 + +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + +// The baseline rd thresholds for breaking out of the rd loop for +// certain modes are assumed to be based on 8x8 blocks. +// This table is used to correct for block size. +// The factors here are << 2 (2 = x0.5, 32 = x8 etc). +static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { +#if CONFIG_CB4X4 + 2, 2, 2, +#endif + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, +#if CONFIG_EXT_PARTITION + 48, 48, 64 +#endif // CONFIG_EXT_PARTITION +}; + +static void fill_mode_costs(AV1_COMP *cpi) { + const FRAME_CONTEXT *const fc = cpi->common.fc; + int i, j; + + for (i = 0; i < INTRA_MODES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + av1_cost_tokens(cpi->y_mode_costs[i][j], av1_kf_y_mode_prob[i][j], + av1_intra_mode_tree); + + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + av1_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i], + av1_intra_mode_tree); + + for (i = 0; i < INTRA_MODES; ++i) + av1_cost_tokens(cpi->intra_uv_mode_cost[i], fc->uv_mode_prob[i], + av1_intra_mode_tree); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + av1_cost_tokens(cpi->switchable_interp_costs[i], + fc->switchable_interp_prob[i], av1_switchable_interp_tree); + +#if CONFIG_PALETTE + for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) { + av1_cost_tokens(cpi->palette_y_size_cost[i], + av1_default_palette_y_size_prob[i], av1_palette_size_tree); + av1_cost_tokens(cpi->palette_uv_size_cost[i], + av1_default_palette_uv_size_prob[i], av1_palette_size_tree); + } + + for (i = 0; i < PALETTE_SIZES; ++i) { + for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { + av1_cost_tokens(cpi->palette_y_color_cost[i][j], + av1_default_palette_y_color_index_prob[i][j], + av1_palette_color_index_tree[i]); + av1_cost_tokens(cpi->palette_uv_color_cost[i][j], + av1_default_palette_uv_color_index_prob[i][j], + av1_palette_color_index_tree[i]); + } + } +#endif // CONFIG_PALETTE + + for (i = 0; i < MAX_TX_DEPTH; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + av1_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j], + av1_tx_size_tree[i]); + +#if CONFIG_EXT_TX + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int s; + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + if (use_inter_ext_tx_for_txsize[s][i]) { + av1_cost_tokens(cpi->inter_tx_type_costs[s][i], + fc->inter_ext_tx_prob[s][i], av1_ext_tx_inter_tree[s]); + } + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + if (use_intra_ext_tx_for_txsize[s][i]) { + for (j = 0; j < INTRA_MODES; ++j) + av1_cost_tokens(cpi->intra_tx_type_costs[s][i][j], + fc->intra_ext_tx_prob[s][i][j], + av1_ext_tx_intra_tree[s]); + } + } + } +#else + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + av1_cost_tokens(cpi->intra_tx_type_costs[i][j], + fc->intra_ext_tx_prob[i][j], av1_ext_tx_tree); + } + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + av1_cost_tokens(cpi->inter_tx_type_costs[i], fc->inter_ext_tx_prob[i], + av1_ext_tx_tree); + } +#endif // CONFIG_EXT_TX +#if CONFIG_EXT_INTRA +#if CONFIG_INTRA_INTERP + for (i = 0; i < INTRA_FILTERS + 1; ++i) + av1_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i], + av1_intra_filter_tree); +#endif // CONFIG_INTRA_INTERP +#endif // CONFIG_EXT_INTRA +#if CONFIG_LOOP_RESTORATION + av1_cost_tokens(cpi->switchable_restore_cost, fc->switchable_restore_prob, + av1_switchable_restore_tree); +#endif // CONFIG_LOOP_RESTORATION +#if CONFIG_GLOBAL_MOTION + av1_cost_tokens(cpi->gmtype_cost, fc->global_motion_types_prob, + av1_global_motion_types_tree); +#endif // CONFIG_GLOBAL_MOTION +} + +void av1_fill_token_costs(av1_coeff_cost *c, + av1_coeff_probs_model (*p)[PLANE_TYPES]) { + int i, j, k, l; + TX_SIZE t; + for (t = 0; t < TX_SIZES; ++t) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + aom_prob probs[ENTROPY_NODES]; + av1_model_to_full_probs(p[t][i][j][k][l], probs); + av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree); + av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, + av1_coef_tree); + assert(c[t][i][j][k][0][l][EOB_TOKEN] == + c[t][i][j][k][1][l][EOB_TOKEN]); + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit16lut_8[QINDEX_RANGE]; +static int sad_per_bit4lut_8[QINDEX_RANGE]; + +#if CONFIG_HIGHBITDEPTH +static int sad_per_bit16lut_10[QINDEX_RANGE]; +static int sad_per_bit4lut_10[QINDEX_RANGE]; +static int sad_per_bit16lut_12[QINDEX_RANGE]; +static int sad_per_bit4lut_12[QINDEX_RANGE]; +#endif + +static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, + aom_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. + for (i = 0; i < range; i++) { + const double q = av1_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + bit4lut[i] = (int)(0.063 * q + 2.742); + } +} + +void av1_init_me_luts(void) { + init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, + AOM_BITS_8); +#if CONFIG_HIGHBITDEPTH + init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE, + AOM_BITS_10); + init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE, + AOM_BITS_12); +#endif +} + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { + 128, 144, 128, 128, 144, +#if CONFIG_EXT_REFS + // TODO(zoeliu): To adjust further following factor values. + 128, 128, 128 + // TODO(weitinglin): We should investigate if the values should be the same + // as the value used by OVERLAY frame + , + 144 +#endif // CONFIG_EXT_REFS +}; + +int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { + const int64_t q = av1_dc_quant(qindex, 0, cpi->common.bit_depth); +#if CONFIG_HIGHBITDEPTH + int64_t rdmult = 0; + switch (cpi->common.bit_depth) { + case AOM_BITS_8: rdmult = 88 * q * q / 24; break; + case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; + case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +#else + int64_t rdmult = 88 * q * q / 24; +#endif // CONFIG_HIGHBITDEPTH + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + if (rdmult < 1) rdmult = 1; + return (int)rdmult; +} + +static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { + double q; +#if CONFIG_HIGHBITDEPTH + switch (bit_depth) { + case AOM_BITS_8: q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: q = av1_dc_quant(qindex, 0, AOM_BITS_10) / 16.0; break; + case AOM_BITS_12: q = av1_dc_quant(qindex, 0, AOM_BITS_12) / 64.0; break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +#else + (void)bit_depth; + q = av1_dc_quant(qindex, 0, AOM_BITS_8) / 4.0; +#endif // CONFIG_HIGHBITDEPTH + // TODO(debargha): Adjust the function below. + return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { +#if CONFIG_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case AOM_BITS_8: + x->sadperbit16 = sad_per_bit16lut_8[qindex]; + x->sadperbit4 = sad_per_bit4lut_8[qindex]; + break; + case AOM_BITS_10: + x->sadperbit16 = sad_per_bit16lut_10[qindex]; + x->sadperbit4 = sad_per_bit4lut_10[qindex]; + break; + case AOM_BITS_12: + x->sadperbit16 = sad_per_bit16lut_12[qindex]; + x->sadperbit4 = sad_per_bit4lut_12[qindex]; + break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + } +#else + (void)cpi; + x->sadperbit16 = sad_per_bit16lut_8[qindex]; + x->sadperbit4 = sad_per_bit4lut_8[qindex]; +#endif // CONFIG_HIGHBITDEPTH +} + +static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = + clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) + + cm->y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + +#if CONFIG_CB4X4 + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; +#else + if (bsize >= BLOCK_8X8) { + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } else { + for (i = 0; i < MAX_REFS; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult_sub8x8[i] < thresh_max + ? rd->thresh_mult_sub8x8[i] * t / 4 + : INT_MAX; + } +#endif + } + } +} + +#if CONFIG_REF_MV +void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, + int ref_mv_idx) { + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + int8_t rf_type = av1_ref_frame_type(x->e_mbd.mi[0]->mbmi.ref_frame); + int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], ref, ref_mv_idx); + (void)ref_frame; + x->mvcost = x->mv_cost_stack[nmv_ctx]; + x->nmvjointcost = x->nmv_vec_cost[nmv_ctx]; + x->mvsadcost = x->mvcost; + x->nmvjointsadcost = x->nmvjointcost; +} +#endif + +void av1_initialize_rd_consts(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + RD_OPT *const rd = &cpi->rd; + int i; +#if CONFIG_REF_MV + int nmv_ctx; +#endif + + aom_clear_system_state(); + + rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128). + rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + + set_error_per_bit(x, rd->RDMULT); + + set_block_thresholds(cm, rd); + +#if CONFIG_REF_MV + for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { + av1_build_nmv_cost_table( + x->nmv_vec_cost[nmv_ctx], + cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx] + : x->nmvcost[nmv_ctx], + &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv); + } + x->mvcost = x->mv_cost_stack[0]; + x->nmvjointcost = x->nmv_vec_cost[0]; + x->mvsadcost = x->mvcost; + x->nmvjointsadcost = x->nmvjointcost; +#else + av1_build_nmv_cost_table( + x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, + &cm->fc->nmvc, cm->allow_high_precision_mv); +#endif + + if (cpi->oxcf.pass != 1) { + av1_fill_token_costs(x->token_costs, cm->fc->coef_probs); + + if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || + cm->frame_type == KEY_FRAME) { +#if CONFIG_EXT_PARTITION_TYPES + for (i = 0; i < PARTITION_PLOFFSET; ++i) + av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i], + av1_partition_tree); + for (; i < PARTITION_CONTEXTS_PRIMARY; ++i) + av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i], + av1_ext_partition_tree); +#else + for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) + av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i], + av1_partition_tree); +#endif // CONFIG_EXT_PARTITION_TYPES +#if CONFIG_UNPOISON_PARTITION_CTX + for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { + aom_prob p = cm->fc->partition_prob[i][PARTITION_VERT]; + assert(p > 0); + cpi->partition_cost[i][PARTITION_NONE] = INT_MAX; + cpi->partition_cost[i][PARTITION_HORZ] = INT_MAX; + cpi->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0); + cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); + } + for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { + aom_prob p = cm->fc->partition_prob[i][PARTITION_HORZ]; + assert(p > 0); + cpi->partition_cost[i][PARTITION_NONE] = INT_MAX; + cpi->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0); + cpi->partition_cost[i][PARTITION_VERT] = INT_MAX; + cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1); + } + cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX; + cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX; + cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX; + cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0; +#endif // CONFIG_UNPOISON_PARTITION_CTX + } + + fill_mode_costs(cpi); + + if (!frame_is_intra_only(cm)) { +#if CONFIG_REF_MV + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { + cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0); + cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1); + } + + for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) { + cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0); + cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1); + } + + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { + cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0); + cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1); + } + + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { + cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0); + cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1); + } +#else + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens((int *)cpi->inter_mode_cost[i], + cm->fc->inter_mode_probs[i], av1_inter_mode_tree); +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i], + cm->fc->inter_compound_mode_probs[i], + av1_inter_compound_mode_tree); + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + av1_cost_tokens((int *)cpi->interintra_mode_cost[i], + cm->fc->interintra_mode_prob[i], + av1_interintra_mode_tree); +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { + av1_cost_tokens((int *)cpi->motion_mode_cost[i], + cm->fc->motion_mode_prob[i], av1_motion_mode_tree); + } +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { + cpi->motion_mode_cost1[i][0] = av1_cost_bit(cm->fc->obmc_prob[i], 0); + cpi->motion_mode_cost1[i][1] = av1_cost_bit(cm->fc->obmc_prob[i], 1); + } +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. + + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, + 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, + 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, + 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, + 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, + 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, + 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, + 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, + 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +static void get_entropy_contexts_plane( + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { + const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + + int i; + +#if CONFIG_CB4X4 + switch (tx_size) { + case TX_2X2: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_4X4: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 16) + t_above[i] = + !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); + for (i = 0; i < num_4x4_h; i += 16) + t_left[i] = + !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); + break; + case TX_4X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_8X4: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_8X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + case TX_16X8: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_16X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 16) + t_left[i] = + !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); + break; + case TX_32X16: + for (i = 0; i < num_4x4_w; i += 16) + t_above[i] = + !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + + default: assert(0 && "Invalid transform size."); break; + } + return; +#endif + + switch (tx_size) { + case TX_4X4: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; +#if CONFIG_TX64X64 + case TX_64X64: + for (i = 0; i < num_4x4_w; i += 16) + t_above[i] = + !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); + for (i = 0; i < num_4x4_h; i += 16) + t_left[i] = + !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); + break; +#endif // CONFIG_TX64X64 + case TX_4X8: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_8X4: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X16: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_16X8: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X32: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + case TX_32X16: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + default: assert(0 && "Invalid transform size."); break; + } +} + +void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left); +} + +void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + int i; + int zero_seen = 0; + int best_index = 0; + int best_sad = INT_MAX; + int this_sad = INT_MAX; + int max_mv = 0; + int near_same_nearest; + uint8_t *src_y_ptr = x->plane[0].src.buf; + uint8_t *ref_y_ptr; + const int num_mv_refs = + MAX_MV_REF_CANDIDATES + + (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); + + MV pred_mv[3]; + pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; + pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; + pred_mv[2] = x->pred_mv[ref_frame]; + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == + x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + // Get the sad for each candidate reference mv. + for (i = 0; i < num_mv_refs; ++i) { + const MV *this_mv = &pred_mv[i]; + int fp_row, fp_col; + + if (i == 1 && near_same_nearest) continue; + fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, + ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + best_index = i; + } + } + + // Note the index of the mv that worked best in the reference list. + x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + int i; + + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->mbmi.sb_type, dst[i].buf, + i ? src->uv_crop_width : src->y_crop_width, + i ? src->uv_crop_height : src->y_crop_height, + dst[i].stride, mi_row, mi_col, i ? scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, + int stride) { + const int bw = b_width_log2_lookup[plane_bsize]; + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; +} + +int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block, + int16_t *base) { + const int stride = block_size_wide[plane_bsize]; + return base + av1_raster_block_offset(plane_bsize, raster_block, stride); +} + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, + int ref_frame) { + const AV1_COMMON *const cm = &cpi->common; + const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; + const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) + ? &cm->buffer_pool->frame_bufs[scaled_idx].buf + : NULL; +} + +#if CONFIG_DUAL_FILTER +int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int inter_filter_cost = 0; + int dir; + + for (dir = 0; dir < 2; ++dir) { + if (has_subpel_mv_component(xd->mi[0], xd, dir) || + (mbmi->ref_frame[1] > INTRA_FRAME && + has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + inter_filter_cost += + cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]]; + } + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; + } else { + return 0; + } +} +#else +int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int ctx = av1_get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + cpi->switchable_interp_costs[ctx][mbmi->interp_filter]; + } + return 0; +} +#endif + +void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { + int i; + RD_OPT *const rd = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + + // Set baseline threshold values. + for (i = 0; i < MAX_MODES; ++i) rd->thresh_mult[i] = cpi->oxcf.mode == 0; + + if (sf->adaptive_rd_thresh) { + rd->thresh_mult[THR_NEARESTMV] = 300; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARESTL2] = 300; + rd->thresh_mult[THR_NEARESTL3] = 300; + rd->thresh_mult[THR_NEARESTB] = 300; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARESTA] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + } else { + rd->thresh_mult[THR_NEARESTMV] = 0; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARESTL2] = 0; + rd->thresh_mult[THR_NEARESTL3] = 0; + rd->thresh_mult[THR_NEARESTB] = 0; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARESTA] = 0; + rd->thresh_mult[THR_NEARESTG] = 0; + } + + rd->thresh_mult[THR_DC] += 1000; + + rd->thresh_mult[THR_NEWMV] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_NEWL2] += 1000; + rd->thresh_mult[THR_NEWL3] += 1000; + rd->thresh_mult[THR_NEWB] += 1000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_NEWA] += 1000; + rd->thresh_mult[THR_NEWG] += 1000; + + rd->thresh_mult[THR_NEARMV] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARL2] += 1000; + rd->thresh_mult[THR_NEARL3] += 1000; + rd->thresh_mult[THR_NEARB] += 1000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_NEARA] += 1000; + rd->thresh_mult[THR_NEARG] += 1000; + + rd->thresh_mult[THR_ZEROMV] += 2000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_ZEROL2] += 2000; + rd->thresh_mult[THR_ZEROL3] += 2000; + rd->thresh_mult[THR_ZEROB] += 2000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_ZEROG] += 2000; + rd->thresh_mult[THR_ZEROA] += 2000; + + rd->thresh_mult[THR_TM] += 1000; + +#if CONFIG_EXT_INTER + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000; +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + rd->thresh_mult[THR_COMP_NEARESTLA] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000; + rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARESTGA] += 1000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARESTLB] += 1000; + rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000; + rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000; + rd->thresh_mult[THR_COMP_NEARESTGB] += 1000; +#endif // CONFIG_EXT_REFS + +#endif // CONFIG_EXT_INTER + +#if CONFIG_EXT_INTER + + rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500; + + rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500; +#endif // CONFIG_EXT_REFS + + rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500; + + rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500; + + rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500; + + rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500; +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + rd->thresh_mult[THR_COMP_NEARLA] += 1500; + rd->thresh_mult[THR_COMP_NEWLA] += 2000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARL2A] += 1500; + rd->thresh_mult[THR_COMP_NEWL2A] += 2000; + rd->thresh_mult[THR_COMP_NEARL3A] += 1500; + rd->thresh_mult[THR_COMP_NEWL3A] += 2000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARGA] += 1500; + rd->thresh_mult[THR_COMP_NEWGA] += 2000; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_NEARLB] += 1500; + rd->thresh_mult[THR_COMP_NEWLB] += 2000; + rd->thresh_mult[THR_COMP_NEARL2B] += 1500; + rd->thresh_mult[THR_COMP_NEWL2B] += 2000; + rd->thresh_mult[THR_COMP_NEARL3B] += 1500; + rd->thresh_mult[THR_COMP_NEWL3B] += 2000; + rd->thresh_mult[THR_COMP_NEARGB] += 1500; + rd->thresh_mult[THR_COMP_NEWGB] += 2000; +#endif // CONFIG_EXT_REFS + + rd->thresh_mult[THR_COMP_ZEROLA] += 2500; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_ZEROL2A] += 2500; + rd->thresh_mult[THR_COMP_ZEROL3A] += 2500; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_ZEROGA] += 2500; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_ZEROLB] += 2500; + rd->thresh_mult[THR_COMP_ZEROL2B] += 2500; + rd->thresh_mult[THR_COMP_ZEROL3B] += 2500; + rd->thresh_mult[THR_COMP_ZEROGB] += 2500; +#endif // CONFIG_EXT_REFS + +#endif // CONFIG_EXT_INTER + + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D207_PRED] += 2500; + rd->thresh_mult[THR_D153_PRED] += 2500; + rd->thresh_mult[THR_D63_PRED] += 2500; + rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D45_PRED] += 2500; + +#if CONFIG_EXT_INTER + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000; + + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000; +#endif // CONFIG_EXT_REFS + + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000; + +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000; +#endif // CONFIG_EXT_REFS + + rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500; + rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000; +#endif // CONFIG_EXT_INTER +} + +void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { + static const int thresh_mult[MAX_REFS] = { +#if CONFIG_EXT_REFS + 2500, + 2500, + 2500, + 2500, + 2500, + 2500, + 4500, + 4500, + 4500, + 4500, + 4500, + 4500, + 4500, + 4500, + 2500 +#else + 2500, + 2500, + 2500, + 4500, + 4500, + 2500 +#endif // CONFIG_EXT_REFS + }; + RD_OPT *const rd = &cpi->rd; + memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult)); +} + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index) { + if (rd_thresh > 0) { +#if CONFIG_CB4X4 + const int top_mode = MAX_MODES; +#else + const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; +#endif + int mode; + for (mode = 0; mode < top_mode; ++mode) { + const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4); + const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size); + BLOCK_SIZE bs; + for (bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> 4); + } else { + *fact = AOMMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT); + } + } + } + } +} + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth) { + const int q = av1_dc_quant(qindex, qdelta, bit_depth); +#if CONFIG_HIGHBITDEPTH + switch (bit_depth) { + case AOM_BITS_8: return 20 * q; + case AOM_BITS_10: return 5 * q; + case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +#else + return 20 * q; +#endif // CONFIG_HIGHBITDEPTH +} diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h new file mode 100644 index 000000000..c0ac1f7e7 --- /dev/null +++ b/third_party/aom/av1/encoder/rd.h @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_RD_H_ +#define AV1_ENCODER_RD_H_ + +#include + +#if CONFIG_ANS +#include "aom_dsp/ans.h" +#endif // CONFIG_ANS +#include "av1/common/blockd.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/cost.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDDIV_BITS 7 +#define RD_EPB_SHIFT 6 + +#define RDCOST(RM, DM, R, D) \ + (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + (D << DM)) + +#define RDCOST_DBL(RM, DM, R, D) \ + (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ + ((double)(D) * (1 << (DM)))) + +#define QIDX_SKIP_THRESH 115 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 + +#define INVALID_MV 0x80008000 + +#if CONFIG_EXT_REFS +#define MAX_REFS 15 +#else +#define MAX_REFS 6 +#endif // CONFIG_EXT_REFS + +#define RD_THRESH_MAX_FACT 64 +#define RD_THRESH_INC 1 + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code. +typedef enum { + THR_NEARESTMV, +#if CONFIG_EXT_REFS + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, +#endif // CONFIG_EXT_REFS + THR_NEARESTA, + THR_NEARESTG, + + THR_DC, + + THR_NEWMV, +#if CONFIG_EXT_REFS + THR_NEWL2, + THR_NEWL3, + THR_NEWB, +#endif // CONFIG_EXT_REFS + THR_NEWA, + THR_NEWG, + + THR_NEARMV, +#if CONFIG_EXT_REFS + THR_NEARL2, + THR_NEARL3, + THR_NEARB, +#endif // CONFIG_EXT_REFS + THR_NEARA, + THR_NEARG, + + THR_ZEROMV, +#if CONFIG_EXT_REFS + THR_ZEROL2, + THR_ZEROL3, + THR_ZEROB, +#endif // CONFIG_EXT_REFS + THR_ZEROG, + THR_ZEROA, + +#if CONFIG_EXT_INTER + + THR_COMP_NEAREST_NEARESTLA, +#if CONFIG_EXT_REFS + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, +#endif // CONFIG_EXT_REFS + THR_COMP_NEAREST_NEARESTGA, +#if CONFIG_EXT_REFS + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + THR_COMP_NEARESTLA, +#if CONFIG_EXT_REFS + THR_COMP_NEARESTL2A, + THR_COMP_NEARESTL3A, +#endif // CONFIG_EXT_REFS + THR_COMP_NEARESTGA, +#if CONFIG_EXT_REFS + THR_COMP_NEARESTLB, + THR_COMP_NEARESTL2B, + THR_COMP_NEARESTL3B, + THR_COMP_NEARESTGB, +#endif // CONFIG_EXT_REFS + +#endif // CONFIG_EXT_INTER + + THR_TM, + +#if CONFIG_ALT_INTRA + THR_SMOOTH, +#endif // CONFIG_ALT_INTRA + +#if CONFIG_EXT_INTER + + THR_COMP_NEAR_NEARESTLA, + THR_COMP_NEAREST_NEARLA, + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_NEW_NEWLA, + THR_COMP_ZERO_ZEROLA, + +#if CONFIG_EXT_REFS + THR_COMP_NEAR_NEARESTL2A, + THR_COMP_NEAREST_NEARL2A, + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_ZERO_ZEROL2A, + + THR_COMP_NEAR_NEARESTL3A, + THR_COMP_NEAREST_NEARL3A, + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_ZERO_ZEROL3A, +#endif // CONFIG_EXT_REFS + + THR_COMP_NEAR_NEARESTGA, + THR_COMP_NEAREST_NEARGA, + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_NEW_NEWGA, + THR_COMP_ZERO_ZEROGA, + +#if CONFIG_EXT_REFS + THR_COMP_NEAR_NEARESTLB, + THR_COMP_NEAREST_NEARLB, + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_NEW_NEWLB, + THR_COMP_ZERO_ZEROLB, + + THR_COMP_NEAR_NEARESTL2B, + THR_COMP_NEAREST_NEARL2B, + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_ZERO_ZEROL2B, + + THR_COMP_NEAR_NEARESTL3B, + THR_COMP_NEAREST_NEARL3B, + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_ZERO_ZEROL3B, + + THR_COMP_NEAR_NEARESTGB, + THR_COMP_NEAREST_NEARGB, + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_NEW_NEWGB, + THR_COMP_ZERO_ZEROGB, +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + THR_COMP_NEARLA, + THR_COMP_NEWLA, +#if CONFIG_EXT_REFS + THR_COMP_NEARL2A, + THR_COMP_NEWL2A, + THR_COMP_NEARL3A, + THR_COMP_NEWL3A, +#endif // CONFIG_EXT_REFS + THR_COMP_NEARGA, + THR_COMP_NEWGA, + +#if CONFIG_EXT_REFS + THR_COMP_NEARLB, + THR_COMP_NEWLB, + THR_COMP_NEARL2B, + THR_COMP_NEWL2B, + THR_COMP_NEARL3B, + THR_COMP_NEWL3B, + THR_COMP_NEARGB, + THR_COMP_NEWGB, +#endif // CONFIG_EXT_REFS + + THR_COMP_ZEROLA, +#if CONFIG_EXT_REFS + THR_COMP_ZEROL2A, + THR_COMP_ZEROL3A, +#endif // CONFIG_EXT_REFS + THR_COMP_ZEROGA, + +#if CONFIG_EXT_REFS + THR_COMP_ZEROLB, + THR_COMP_ZEROL2B, + THR_COMP_ZEROL3B, + THR_COMP_ZEROGB, +#endif // CONFIG_EXT_REFS + +#endif // CONFIG_EXT_INTER + + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D207_PRED, + THR_D153_PRED, + THR_D63_PRED, + THR_D117_PRED, + THR_D45_PRED, + +#if CONFIG_EXT_INTER + THR_COMP_INTERINTRA_ZEROL, + THR_COMP_INTERINTRA_NEARESTL, + THR_COMP_INTERINTRA_NEARL, + THR_COMP_INTERINTRA_NEWL, + +#if CONFIG_EXT_REFS + THR_COMP_INTERINTRA_ZEROL2, + THR_COMP_INTERINTRA_NEARESTL2, + THR_COMP_INTERINTRA_NEARL2, + THR_COMP_INTERINTRA_NEWL2, + + THR_COMP_INTERINTRA_ZEROL3, + THR_COMP_INTERINTRA_NEARESTL3, + THR_COMP_INTERINTRA_NEARL3, + THR_COMP_INTERINTRA_NEWL3, +#endif // CONFIG_EXT_REFS + + THR_COMP_INTERINTRA_ZEROG, + THR_COMP_INTERINTRA_NEARESTG, + THR_COMP_INTERINTRA_NEARG, + THR_COMP_INTERINTRA_NEWG, + +#if CONFIG_EXT_REFS + THR_COMP_INTERINTRA_ZEROB, + THR_COMP_INTERINTRA_NEARESTB, + THR_COMP_INTERINTRA_NEARB, + THR_COMP_INTERINTRA_NEWB, +#endif // CONFIG_EXT_REFS + + THR_COMP_INTERINTRA_ZEROA, + THR_COMP_INTERINTRA_NEARESTA, + THR_COMP_INTERINTRA_NEARA, + THR_COMP_INTERINTRA_NEWA, +#endif // CONFIG_EXT_INTER + MAX_MODES +} THR_MODES; + +typedef enum { + THR_LAST, +#if CONFIG_EXT_REFS + THR_LAST2, + THR_LAST3, + THR_BWDR, +#endif // CONFIG_EXT_REFS + THR_GOLD, + THR_ALTR, + + THR_COMP_LA, +#if CONFIG_EXT_REFS + THR_COMP_L2A, + THR_COMP_L3A, +#endif // CONFIG_EXT_REFS + THR_COMP_GA, + +#if CONFIG_EXT_REFS + THR_COMP_LB, + THR_COMP_L2B, + THR_COMP_L3B, + THR_COMP_GB, +#endif // CONFIG_EXT_REFS + + THR_INTRA, +} THR_MODES_SUB8X8; + +typedef struct RD_OPT { + // Thresh_mult is used to set a threshold for the rd score. A higher value + // means that we will accept the best mode so far more often. This number + // is used in combination with the current block size, and thresh_freq_fact + // to pick a threshold. + int thresh_mult[MAX_MODES]; + int thresh_mult_sub8x8[MAX_REFS]; + + int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; + + int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES]; + + int RDMULT; + int RDDIV; +} RD_OPT; + +static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->rdcost = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; +#if CONFIG_RD_DEBUG + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = 0; +#if CONFIG_VAR_TX + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = 0; + } +#endif + } +#endif +} + +static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = INT_MAX; + rd_stats->dist = INT64_MAX; + rd_stats->rdcost = INT64_MAX; + rd_stats->sse = INT64_MAX; + rd_stats->skip = 0; +#if CONFIG_RD_DEBUG + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = INT_MAX; +#if CONFIG_VAR_TX + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX; + } +#endif + } +#endif +} + +static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, + const RD_STATS *rd_stats_src) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats_dst->rate += rd_stats_src->rate; + rd_stats_dst->dist += rd_stats_src->dist; + rd_stats_dst->sse += rd_stats_src->sse; + rd_stats_dst->skip &= rd_stats_src->skip; +#if CONFIG_RD_DEBUG + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; +#if CONFIG_VAR_TX + { + // TODO(angiebird): optimize this part + int r, c; + int ref_txb_coeff_cost = 0; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + rd_stats_dst->txb_coeff_cost_map[plane][r][c] += + rd_stats_src->txb_coeff_cost_map[plane][r][c]; + ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c]; + } + assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); + } +#endif + } +#endif +} + +struct TileInfo; +struct TileDataEnc; +struct AV1_COMP; +struct macroblock; + +int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex); + +void av1_initialize_rd_consts(struct AV1_COMP *cpi); + +void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, + int qindex); + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, + unsigned int qstep, int *rate, int64_t *dist); + +int av1_get_switchable_rate(const struct AV1_COMP *cpi, const MACROBLOCKD *xd); + +int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block, + int stride); + +int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block, + int16_t *base); + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, + int ref_frame); + +void av1_init_me_luts(void); + +#if CONFIG_REF_MV +void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, + int ref_mv_idx); +#endif + +void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]); + +void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); + +void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi); + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*fact)[MAX_MODES], int rd_thresh, int bsize, + int best_mode_index); + +void av1_fill_token_costs(av1_coeff_cost *c, + av1_coeff_probs_model (*p)[PLANE_TYPES]); + +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + int thresh_fact) { + return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, + BLOCK_SIZE block_size); + +static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { + x->errorperbit = rdmult >> RD_EPB_SHIFT; + x->errorperbit += (x->errorperbit == 0); +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv); + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c new file mode 100644 index 000000000..a1096f782 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt.c @@ -0,0 +1,12713 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#if CONFIG_LV_MAP +#include "av1/common/txb_common.h" +#endif +#if CONFIG_WARPED_MOTION +#include "av1/common/warped_motion.h" +#endif // CONFIG_WARPED_MOTION + +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.h" +#endif +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/mcomp.h" +#if CONFIG_PALETTE +#include "av1/encoder/palette.h" +#endif // CONFIG_PALETTE +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" +#if CONFIG_PVQ +#include "av1/encoder/pvq_encoder.h" +#endif // CONFIG_PVQ +#if CONFIG_PVQ || CONFIG_DAALA_DIST +#include "av1/common/pvq.h" +#endif // CONFIG_PVQ || CONFIG_DAALA_DIST +#if CONFIG_DUAL_FILTER +#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) +static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { + { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 }, + { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, + { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }, +}; +#endif // CONFIG_DUAL_FILTER + +#if CONFIG_EXT_REFS + +#define LAST_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \ + (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) +#define LAST2_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \ + (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) +#define LAST3_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ + (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) +#define GOLDEN_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ + (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) +#define BWDREF_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ + (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME)) +#define ALTREF_FRAME_MODE_MASK \ + ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \ + (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME)) + +#else + +#define LAST_FRAME_MODE_MASK \ + ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) +#define GOLDEN_FRAME_MODE_MASK \ + ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) +#define ALTREF_FRAME_MODE_MASK \ + ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME)) + +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS +#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01) +#else +#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) +#endif // CONFIG_EXT_REFS + +#define MIN_EARLY_TERM_INDEX 3 +#define NEW_MV_DISCOUNT_FACTOR 8 + +#if CONFIG_EXT_INTRA +#define ANGLE_SKIP_THRESH 10 +#define FILTER_FAST_SEARCH 1 +#endif // CONFIG_EXT_INTRA + +const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert + -7.7051, -3.2234, -3.6193, 3.4533 }; // horz + +typedef struct { + PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame[2]; +} MODE_DEFINITION; + +typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; + +struct rdcost_block_args { + const AV1_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]; + RD_STATS rd_stats; + int64_t this_rd; + int64_t best_rd; + int exit_early; + int use_fast_coef_costing; +}; + +#define LAST_NEW_MV_INDEX 6 +static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { + { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, + { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, + { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + + { NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { NEARMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { NEARMV, { LAST2_FRAME, NONE_FRAME } }, + { NEARMV, { LAST3_FRAME, NONE_FRAME } }, + { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEARMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { ZEROMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { ZEROMV, { LAST2_FRAME, NONE_FRAME } }, + { ZEROMV, { LAST3_FRAME, NONE_FRAME } }, + { ZEROMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NONE_FRAME } }, + +// TODO(zoeliu): May need to reconsider the order on the modes to check + +#if CONFIG_EXT_INTER + { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_INTER + + { TM_PRED, { INTRA_FRAME, NONE_FRAME } }, + +#if CONFIG_ALT_INTRA + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, +#endif // CONFIG_ALT_INTRA + +#if CONFIG_EXT_INTER + { NEAR_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, + +#if CONFIG_EXT_REFS + { NEAR_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + + { NEAR_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + +#if CONFIG_EXT_REFS + { NEAR_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS + +#else // CONFIG_EXT_INTER + + { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + +#if CONFIG_EXT_REFS + { NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS + + { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } }, + { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + +#if CONFIG_EXT_REFS + { ZEROMV, { LAST_FRAME, BWDREF_FRAME } }, + { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, + { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS + +#endif // CONFIG_EXT_INTER + + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D207_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D153_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D63_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D117_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, + +#if CONFIG_EXT_INTER + { ZEROMV, { LAST_FRAME, INTRA_FRAME } }, + { NEARESTMV, { LAST_FRAME, INTRA_FRAME } }, + { NEARMV, { LAST_FRAME, INTRA_FRAME } }, + { NEWMV, { LAST_FRAME, INTRA_FRAME } }, + +#if CONFIG_EXT_REFS + { ZEROMV, { LAST2_FRAME, INTRA_FRAME } }, + { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } }, + { NEARMV, { LAST2_FRAME, INTRA_FRAME } }, + { NEWMV, { LAST2_FRAME, INTRA_FRAME } }, + + { ZEROMV, { LAST3_FRAME, INTRA_FRAME } }, + { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } }, + { NEARMV, { LAST3_FRAME, INTRA_FRAME } }, + { NEWMV, { LAST3_FRAME, INTRA_FRAME } }, +#endif // CONFIG_EXT_REFS + + { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } }, + { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } }, + { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } }, + +#if CONFIG_EXT_REFS + { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } }, + { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } }, + { NEARMV, { BWDREF_FRAME, INTRA_FRAME } }, + { NEWMV, { BWDREF_FRAME, INTRA_FRAME } }, +#endif // CONFIG_EXT_REFS + + { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } }, + { NEARMV, { ALTREF_FRAME, INTRA_FRAME } }, + { NEWMV, { ALTREF_FRAME, INTRA_FRAME } }, +#endif // CONFIG_EXT_INTER +}; + +static const REF_DEFINITION av1_ref_order[MAX_REFS] = { + { { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { { LAST2_FRAME, NONE_FRAME } }, { { LAST3_FRAME, NONE_FRAME } }, + { { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { { GOLDEN_FRAME, NONE_FRAME } }, { { ALTREF_FRAME, NONE_FRAME } }, + + { { LAST_FRAME, ALTREF_FRAME } }, +#if CONFIG_EXT_REFS + { { LAST2_FRAME, ALTREF_FRAME } }, { { LAST3_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_REFS + { { GOLDEN_FRAME, ALTREF_FRAME } }, + +#if CONFIG_EXT_REFS + { { LAST_FRAME, BWDREF_FRAME } }, { { LAST2_FRAME, BWDREF_FRAME } }, + { { LAST3_FRAME, BWDREF_FRAME } }, { { GOLDEN_FRAME, BWDREF_FRAME } }, +#endif // CONFIG_EXT_REFS + + { { INTRA_FRAME, NONE_FRAME } }, +}; + +#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE +static INLINE int write_uniform_cost(int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return 0; + if (v < m) + return (l - 1) * av1_cost_bit(128, 0); + else + return l * av1_cost_bit(128, 0); +} +#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE + +// constants for prune 1 and prune 2 decision boundaries +#define FAST_EXT_TX_CORR_MID 0.0 +#define FAST_EXT_TX_EDST_MID 0.1 +#define FAST_EXT_TX_CORR_MARGIN 0.5 +#define FAST_EXT_TX_EDST_MARGIN 0.3 + +static const TX_TYPE_1D vtx_tab[TX_TYPES] = { + DCT_1D, ADST_1D, DCT_1D, ADST_1D, +#if CONFIG_EXT_TX + FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, + DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, +#endif // CONFIG_EXT_TX +}; + +static const TX_TYPE_1D htx_tab[TX_TYPES] = { + DCT_1D, DCT_1D, ADST_1D, ADST_1D, +#if CONFIG_EXT_TX + DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, + IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, +#endif // CONFIG_EXT_TX +}; + +#if CONFIG_DAALA_DIST +static int od_compute_var_4x4(od_coeff *x, int stride) { + int sum; + int s2; + int i; + sum = 0; + s2 = 0; + for (i = 0; i < 4; i++) { + int j; + for (j = 0; j < 4; j++) { + int t; + + t = x[i * stride + j]; + sum += t; + s2 += t * t; + } + } + // TODO(yushin) : Check wheter any changes are required for high bit depth. + return (s2 - (sum * sum >> 4)) >> 4; +} + +/* OD_DIST_LP_MID controls the frequency weighting filter used for computing + the distortion. For a value X, the filter is [1 X 1]/(X + 2) and + is applied both horizontally and vertically. For X=5, the filter is + a good approximation for the OD_QM8_Q4_HVS quantization matrix. */ +#define OD_DIST_LP_MID (5) +#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2) + +static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, + od_coeff *y, od_coeff *e_lp, int stride) { + double sum; + int min_var; + double mean_var; + double var_stat; + double activity; + double calibration; + int i; + int j; + double vardist; + + vardist = 0; + OD_ASSERT(qm != OD_FLAT_QM); + (void)qm; +#if 1 + min_var = INT_MAX; + mean_var = 0; + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + int varx; + int vary; + varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride); + vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride); + min_var = OD_MINI(min_var, varx); + mean_var += 1. / (1 + varx); + /* The cast to (double) is to avoid an overflow before the sqrt.*/ + vardist += varx - 2 * sqrt(varx * (double)vary) + vary; + } + } + /* We use a different variance statistic depending on whether activity + masking is used, since the harmonic mean appeared slghtly worse with + masking off. The calibration constant just ensures that we preserve the + rate compared to activity=1. */ + if (use_activity_masking) { + calibration = 1.95; + var_stat = 9. / mean_var; + } else { + calibration = 1.62; + var_stat = min_var; + } + /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the + activity masking constant. */ + activity = calibration * pow(.25 + var_stat, -1. / 6); +#else + activity = 1; +#endif // 1 + sum = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) + sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j]; + } + /* Normalize the filter to unit DC response. */ + sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM * + OD_DIST_LP_NORM); + return activity * activity * (sum + vardist); +} + +// Note : Inputs x and y are in a pixel domain +static double od_compute_dist(int qm, int activity_masking, od_coeff *x, + od_coeff *y, int bsize_w, int bsize_h, + int qindex) { + int i; + double sum; + sum = 0; + + assert(bsize_w >= 8 && bsize_h >= 8); + + if (qm == OD_FLAT_QM) { + for (i = 0; i < bsize_w * bsize_h; i++) { + double tmp; + tmp = x[i] - y[i]; + sum += tmp * tmp; + } + } else { + int j; + DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); + int mid = OD_DIST_LP_MID; + for (i = 0; i < bsize_h; i++) { + for (j = 0; j < bsize_w; j++) { + e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j]; + } + } + for (i = 0; i < bsize_h; i++) { + tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1]; + tmp[i * bsize_w + bsize_w - 1] = + mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2]; + for (j = 1; j < bsize_w - 1; j++) { + tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + + e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1]; + } + } + for (j = 0; j < bsize_w; j++) { + e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; + e_lp[(bsize_h - 1) * bsize_w + j] = + mid * tmp[(bsize_h - 1) * bsize_w + j] + + 2 * tmp[(bsize_h - 2) * bsize_w + j]; + } + for (i = 1; i < bsize_h - 1; i++) { + for (j = 0; j < bsize_w; j++) { + e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + + tmp[(i - 1) * bsize_w + j] + + tmp[(i + 1) * bsize_w + j]; + } + } + for (i = 0; i < bsize_h; i += 8) { + for (j = 0; j < bsize_w; j += 8) { + sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j], + &y[i * bsize_w + j], &e_lp[i * bsize_w + j], + bsize_w); + } + } + /* Scale according to linear regression against SSE, for 8x8 blocks. */ + if (activity_masking) { + sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + + (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); + } else { + sum *= qindex >= 128 + ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) + : qindex <= 43 + ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) + : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); + } + } + return sum; +} + +static int64_t av1_daala_dist(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int bsw, + int bsh, int qm, int use_activity_masking, + int qindex) { + int i, j; + int64_t d; + DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]); + + assert(qm == OD_HVS_QM); + + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; + + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; + + d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh, + qindex); + return d; +} +#endif // CONFIG_DAALA_DIST + +static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, + const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + double *hordist, double *verdist) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + const int f_index = bsize - BLOCK_16X16; + if (f_index < 0) { + const int w_shift = bw == 8 ? 1 : 2; + const int h_shift = bh == 8 ? 1 : 2; +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += + (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * + (src16[j + i * src_stride] - dst16[j + i * dst_stride]); + } + } else { +#endif // CONFIG_HIGHBITDEPTH + + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * + (src[j + i * src_stride] - dst[j + i * dst_stride]); + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + } else { + cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[1]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[2]); + cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[5]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[6]); + cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[9]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[10]); + cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[13]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[14]); + cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); + } + + double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + + esq[12] + esq[13] + esq[14] + esq[15]; + if (total > 0) { + const double e_recip = 1.0 / total; + hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; + hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; + hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; + verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; + verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + } else { + hordist[0] = verdist[0] = 0.25; + hordist[1] = verdist[1] = 0.25; + hordist[2] = verdist[2] = 0.25; + } +} + +static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, + const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride) { + int prune_bitmask = 0; + double svm_proj_h = 0, svm_proj_v = 0; + double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 }; + get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, + hdist, vdist); + + svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] + + vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3]; + svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] + + hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7]; + if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) + prune_bitmask |= 1 << FLIPADST_1D; + else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) + prune_bitmask |= 1 << ADST_1D; + + if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) + prune_bitmask |= 1 << (FLIPADST_1D + 8); + else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) + prune_bitmask |= 1 << (ADST_1D + 8); + + return prune_bitmask; +} + +#if CONFIG_EXT_TX +static void get_horver_correlation(const int16_t *diff, int stride, int w, + int h, double *hcorr, double *vcorr) { + // Returns hor/ver correlation coefficient + const int num = (h - 1) * (w - 1); + double num_r; + int i, j; + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, y_sum = 0, z_sum = 0; + int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0; + double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n; + *hcorr = *vcorr = 1; + + assert(num > 0); + num_r = 1.0 / num; + for (i = 1; i < h; ++i) { + for (j = 1; j < w; ++j) { + const int16_t x = diff[i * stride + j]; + const int16_t y = diff[i * stride + j - 1]; + const int16_t z = diff[(i - 1) * stride + j]; + xy_sum += x * y; + xz_sum += x * z; + x_sum += x; + y_sum += y; + z_sum += z; + x2_sum += x * x; + y2_sum += y * y; + z2_sum += z * z; + } + } + x_var_n = x2_sum - (x_sum * x_sum) * num_r; + y_var_n = y2_sum - (y_sum * y_sum) * num_r; + z_var_n = z2_sum - (z_sum * z_sum) * num_r; + xy_var_n = xy_sum - (x_sum * y_sum) * num_r; + xz_var_n = xz_sum - (x_sum * z_sum) * num_r; + if (x_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrt(x_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } + if (x_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrt(x_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } +} + +int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) { + double hcorr, vcorr; + int prune_bitmask = 0; + get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr); + + if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) + prune_bitmask |= 1 << IDTX_1D; + else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) + prune_bitmask |= 1 << DCT_1D; + + if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) + prune_bitmask |= 1 << (IDTX_1D + 8); + else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) + prune_bitmask |= 1 << (DCT_1D + 8); + return prune_bitmask; +} + +// Performance drop: 0.5%, Speed improvement: 24% +static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, const MACROBLOCKD *xd, + int adst_flipadst, int dct_idtx) { + int prune = 0; + + if (adst_flipadst) { + const struct macroblock_plane *const p = &x->plane[0]; + const struct macroblockd_plane *const pd = &xd->plane[0]; + prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } + if (dct_idtx) { + av1_subtract_plane(x, bsize, 0); + const struct macroblock_plane *const p = &x->plane[0]; + const int bw = 4 << (b_width_log2_lookup[bsize]); + const int bh = 4 << (b_height_log2_lookup[bsize]); + prune |= dct_vs_idtx(p->src_diff, bw, bw, bh); + } + + return prune; +} +#endif // CONFIG_EXT_TX + +// Performance drop: 0.3%, Speed improvement: 5% +static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, + const MACROBLOCK *x, const MACROBLOCKD *xd) { + const struct macroblock_plane *const p = &x->plane[0]; + const struct macroblockd_plane *const pd = &xd->plane[0]; + return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); +} + +static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, + const MACROBLOCKD *const xd, int tx_set) { +#if CONFIG_EXT_TX + const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; +#else + const int tx_set_1D[TX_TYPES_1D] = { 0 }; +#endif // CONFIG_EXT_TX + + switch (cpi->sf.tx_type_search.prune_mode) { + case NO_PRUNE: return 0; break; + case PRUNE_ONE: + if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) + return 0; + return prune_one_for_sby(cpi, bsize, x, xd); + break; +#if CONFIG_EXT_TX + case PRUNE_TWO: + if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) { + if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0; + return prune_two_for_sby(cpi, bsize, x, xd, 0, 1); + } + if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) + return prune_two_for_sby(cpi, bsize, x, xd, 1, 0); + return prune_two_for_sby(cpi, bsize, x, xd, 1, 1); + break; +#endif // CONFIG_EXT_TX + } + assert(0); + return 0; +} + +static int do_tx_type_search(TX_TYPE tx_type, int prune) { +// TODO(sarahparker) implement for non ext tx +#if CONFIG_EXT_TX + return !(((prune >> vtx_tab[tx_type]) & 1) | + ((prune >> (htx_tab[tx_type] + 8)) & 1)); +#else + // temporary to avoid compiler warnings + (void)vtx_tab; + (void)htx_tab; + (void)tx_type; + (void)prune; + return 1; +#endif // CONFIG_EXT_TX +} + +static void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, + int plane, int64_t sse, int *rate, + int64_t *dist) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = +#if CONFIG_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : +#endif // CONFIG_HIGHBITDEPTH + 3; + + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { + const int64_t square_error = sse; + int quantizer = (pd->dequant[1] >> dequant_shift); + + if (quantizer < 120) + *rate = (int)((square_error * (280 - quantizer)) >> + (16 - AV1_PROB_COST_SHIFT)); + else + *rate = 0; + *dist = (square_error * quantizer) >> 8; + } else { + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], + pd->dequant[1] >> dequant_shift, rate, dist); + } + + *dist <<= 4; +} + +static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + int plane; + const int ref = xd->mi[0]->mbmi.ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + x->pred_sse[ref] = 0; + + for (plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#else + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); +#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + + unsigned int sse; + int rate; + int64_t dist; + +#if CONFIG_CB4X4 + if (x->skip_chroma_rd && plane) continue; +#endif // CONFIG_CB4X4 + + // TODO(geza): Write direct sse functions that do not compute + // variance as well. + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + + if (plane == 0) x->pred_sse[ref] = sse; + + total_sse += sse; + + model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist); + + rate_sum += rate; + dist_sum += dist; + } + + *skip_txfm_sb = total_sse == 0; + *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + int i; + int64_t error = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +#if CONFIG_HIGHBITDEPTH +int64_t av1_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_PVQ +// Without PVQ, av1_block_error_c() return two kind of errors, +// 1) reconstruction (i.e. decoded) error and +// 2) Squared sum of transformed residue (i.e. 'coeff') +// However, if PVQ is enabled, coeff does not keep the transformed residue +// but instead a transformed original is kept. +// Hence, new parameter ref vector (i.e. transformed predicted signal) +// is required to derive the residue signal, +// i.e. coeff - ref = residue (all transformed). + +#if CONFIG_HIGHBITDEPTH +static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + const tran_low_t *ref, + intptr_t block_size, int64_t *ssz, + int bd) { + int64_t error; + int64_t sqcoeff; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 1 << (shift - 1) : 0; + // Use the existing sse codes for calculating distortion of decoded signal: + // i.e. (orig - decoded)^2 + // For high bit depth, throw away ssz until a 32-bit version of + // av1_block_error_fp is written. + int64_t ssz_trash; + error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash); + // prediction residue^2 = (orig - ref)^2 + sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + *ssz = sqcoeff; + return error; +} +#else +// TODO(yushin) : Since 4x4 case does not need ssz, better to refactor into +// a separate function that does not do the extra computations for ssz. +static int64_t av1_block_error2_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + const tran_low_t *ref, intptr_t block_size, + int64_t *ssz) { + int64_t error; + // Use the existing sse codes for calculating distortion of decoded signal: + // i.e. (orig - decoded)^2 + error = av1_block_error_fp(coeff, dqcoeff, block_size); + // prediction residue^2 = (orig - ref)^2 + *ssz = av1_block_error_fp(coeff, ref, block_size); + return error; +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_PVQ + +#if !CONFIG_PVQ || CONFIG_VAR_TX +/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to + * decide whether to include cost of a trailing EOB node or not (i.e. we + * can skip this if the last coefficient in this transform block, e.g. the + * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block, + * were non-zero). */ +#if !CONFIG_LV_MAP +static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, + int use_fast_coef_costing) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const struct macroblock_plane *p = &x->plane[plane]; + const struct macroblockd_plane *pd = &xd->plane[plane]; + const PLANE_TYPE type = pd->plane_type; + const uint16_t *band_count = &band_count_table[tx_size][1]; + const int eob = p->eobs[block]; + const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + const int tx_size_ctx = txsize_sqr_map[tx_size]; + unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)]; + uint8_t token_cache[MAX_TX_SQUARE]; + int pt = combine_entropy_contexts(*a, *l); + int c, cost; + const int16_t *scan = scan_order->scan; + const int16_t *nb = scan_order->neighbors; +#if CONFIG_NEW_TOKENSET + const int ref = is_inter_block(mbmi); + aom_prob *blockz_probs = + cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref]; + +#endif // CONFIG_NEW_TOKENSET + +#if CONFIG_HIGHBITDEPTH + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); +#else + const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); +#endif // CONFIG_HIGHBITDEPTH + +#if !CONFIG_VAR_TX && !CONFIG_SUPERTX + // Check for consistency of tx_size with mode info + assert(tx_size == get_tx_size(plane, xd)); +#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX + (void)cm; + + if (eob == 0) { +#if CONFIG_NEW_TOKENSET + // single eob token + cost = av1_cost_bit(blockz_probs[pt], 0); +#else + cost = token_costs[0][0][pt][EOB_TOKEN]; +#endif // CONFIG_NEW_TOKENSET + } else { + if (use_fast_coef_costing) { + int band_left = *band_count++; + + // dc token + int v = qcoeff[0]; + int16_t prev_t; + cost = av1_get_token_cost(v, &prev_t, cat6_bits); +#if CONFIG_NEW_TOKENSET + cost += (*token_costs)[!prev_t][pt][prev_t]; +#else + cost += (*token_costs)[0][pt][prev_t]; +#endif + + token_cache[0] = av1_pt_energy_class[prev_t]; + ++token_costs; + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + int16_t t; + + v = qcoeff[rc]; + cost += av1_get_token_cost(v, &t, cat6_bits); +#if CONFIG_NEW_TOKENSET + cost += (*token_costs)[!t][!prev_t][t]; +#else + cost += (*token_costs)[!prev_t][!prev_t][t]; +#endif + prev_t = t; + if (!--band_left) { + band_left = *band_count++; + ++token_costs; + } + } + + // eob token + if (band_left || CONFIG_NEW_TOKENSET) + cost += (*token_costs)[0][!prev_t][EOB_TOKEN]; + + } else { // !use_fast_coef_costing + int band_left = *band_count++; + + // dc token + int v = qcoeff[0]; + int16_t tok; +#if !CONFIG_NEW_TOKENSET + unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS]; +#endif + cost = av1_get_token_cost(v, &tok, cat6_bits); +#if CONFIG_NEW_TOKENSET + cost += (*token_costs)[!tok][pt][tok]; +#else + cost += (*token_costs)[0][pt][tok]; +#endif + + token_cache[0] = av1_pt_energy_class[tok]; + ++token_costs; + +#if !CONFIG_NEW_TOKENSET + tok_cost_ptr = &((*token_costs)[!tok]); +#endif + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + + v = qcoeff[rc]; + cost += av1_get_token_cost(v, &tok, cat6_bits); + pt = get_coef_context(nb, token_cache, c); +#if CONFIG_NEW_TOKENSET + cost += (*token_costs)[!tok][pt][tok]; +#else + cost += (*tok_cost_ptr)[pt][tok]; +#endif + token_cache[rc] = av1_pt_energy_class[tok]; + if (!--band_left) { + band_left = *band_count++; + ++token_costs; + } +#if !CONFIG_NEW_TOKENSET + tok_cost_ptr = &((*token_costs)[!tok]); +#endif + } + + // eob token + if (band_left || CONFIG_NEW_TOKENSET) { + pt = get_coef_context(nb, token_cache, c); + cost += (*token_costs)[0][pt][EOB_TOKEN]; + } + } + } + + return cost; +} +#endif // !CONFIG_LV_MAP + +int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, + int use_fast_coef_costing) { +#if !CONFIG_LV_MAP + const AV1_COMMON *const cm = &cpi->common; + return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, + use_fast_coef_costing); +#else // !CONFIG_LV_MAP + (void)scan_order; + (void)use_fast_coef_costing; + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#else + const BLOCK_SIZE plane_bsize = + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#endif // CONFIG_CHROMA_2X2 +#else // CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = + get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); +#endif // CONFIG_CB4X4 + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx); +#endif // !CONFIG_LV_MAP +} +#endif // !CONFIG_PVQ || CONFIG_VAR_TX + +// Get transform block visible dimensions cropped to the MI units. +static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, int blk_col, + BLOCK_SIZE tx_bsize, int *width, int *height, + int *visible_width, int *visible_height) { + assert(tx_bsize <= plane_bsize); + int txb_height = block_size_high[tx_bsize]; + int txb_width = block_size_wide[tx_bsize]; + const int block_height = block_size_high[plane_bsize]; + const int block_width = block_size_wide[plane_bsize]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // TODO(aconverse@google.com): Investigate using crop_width/height here rather + // than the MI size + const int block_rows = + (xd->mb_to_bottom_edge >= 0) + ? block_height + : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; + const int block_cols = + (xd->mb_to_right_edge >= 0) + ? block_width + : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; + const int tx_unit_size = tx_size_wide_log2[0]; + if (width) *width = txb_width; + if (height) *height = txb_height; + *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width); + *visible_height = + clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height); +} + +// Compute the pixel domain sum square error on all visible 4x4s in the +// transform block. +static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int txb_rows, txb_cols, visible_rows, visible_cols; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, + &txb_cols, &txb_rows, &visible_cols, &visible_rows); + assert(visible_rows > 0); + assert(visible_cols > 0); + if (txb_rows == visible_rows && txb_cols == visible_cols) { + unsigned sse; + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + return sse; + } +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint64_t sse = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return (unsigned int)ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + } +#endif // CONFIG_HIGHBITDEPTH + unsigned sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return sse; +} + +// Compute the squares sum squares on all visible 4x4s in the transform block. +static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int visible_rows, visible_cols; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); +} + +void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, + TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, + OUTPUT_STATUS output_status) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; +#if CONFIG_DAALA_DIST + int qm = OD_HVS_QM; + int use_activity_masking = 0; +#if CONFIG_PVQ + use_activity_masking = x->daala_enc.use_activity_masking; +#endif // CONFIG_PVQ + struct macroblockd_plane *const pd = &xd->plane[plane]; +#else // CONFIG_DAALA_DIST + const struct macroblockd_plane *const pd = &xd->plane[plane]; +#endif // CONFIG_DAALA_DIST + + if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) { + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. + const int buffer_length = tx_size_2d[tx_size]; + int64_t this_sse; + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); +#if CONFIG_PVQ + tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block); + +#if CONFIG_HIGHBITDEPTH + const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; + *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff, + buffer_length, &this_sse, bd) >> + shift; +#else + *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, + &this_sse) >> + shift; +#endif // CONFIG_HIGHBITDEPTH +#elif CONFIG_HIGHBITDEPTH + const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; + *out_dist = + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >> + shift; +#else + *out_dist = + av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift; +#endif // CONFIG_PVQ + *out_sse = this_sse >> shift; + } else { + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if !CONFIG_PVQ || CONFIG_DAALA_DIST + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; +#endif + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. + const int src_idx = (blk_row * src_stride + blk_col) + << tx_size_wide_log2[0]; + const int dst_idx = (blk_row * dst_stride + blk_col) + << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const uint16_t eob = p->eobs[block]; + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + +#if CONFIG_DAALA_DIST + if (plane == 0 && bsw >= 8 && bsh >= 8) { + if (output_status == OUTPUT_HAS_DECODED_PIXELS) { + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + const int16_t *pred = &pd->pred[pred_idx]; + int i, j; + DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); + + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred8[j * bsw + i] = pred[j * pred_stride + i]; + *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm, + use_activity_masking, x->qindex); + } else { + *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, + qm, use_activity_masking, x->qindex); + } + } else +#endif // CONFIG_DAALA_DIST + { + const int diff_stride = block_size_wide[plane_bsize]; + const int diff_idx = (blk_row * diff_stride + blk_col) + << tx_size_wide_log2[0]; + const int16_t *diff = &p->src_diff[diff_idx]; + *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2); +#endif // CONFIG_HIGHBITDEPTH + } + *out_sse *= 16; + + if (eob) { + if (output_status == OUTPUT_HAS_DECODED_PIXELS) { +#if CONFIG_DAALA_DIST + if (plane == 0 && bsw >= 8 && bsh >= 8) + *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, + qm, use_activity_masking, x->qindex); + else +#endif // CONFIG_DAALA_DIST + *out_dist = + pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } else { +#if CONFIG_HIGHBITDEPTH + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + recon = CONVERT_TO_BYTEPTR(recon16); + else + recon = (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + +#if !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, + NULL, 0, bsw, bsh, xd->bd); + } else { +#endif // CONFIG_HIGHBITDEPTH + aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, + 0, bsw, bsh); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#else + (void)dst; +#endif // !CONFIG_PVQ + + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + + av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon, + MAX_TX_SIZE, eob); + +#if CONFIG_DAALA_DIST + if (plane == 0 && bsw >= 8 && bsh >= 8) { + *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw, + bsh, qm, use_activity_masking, x->qindex); + } else { + if (plane == 0) { + // Save decoded pixels for inter block in pd->pred to avoid + // block_8x8_rd_txfm_daala_dist() need to produce them + // by calling av1_inverse_transform_block() again. + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int i, j; + + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; + } +#endif // CONFIG_DAALA_DIST + *out_dist = + pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +#if CONFIG_DAALA_DIST + } +#endif // CONFIG_DAALA_DIST + } + *out_dist *= 16; + } else { + *out_dist = *out_sse; + } + } +} + +static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; +#if !CONFIG_TXK_SEL + const AV1_COMMON *cm = &cpi->common; +#endif + int64_t rd1, rd2, rd; + RD_STATS this_rd_stats; + + assert(tx_size == get_tx_size(plane, xd)); + + av1_init_rd_stats(&this_rd_stats); + + if (args->exit_early) return; + + if (!is_inter_block(mbmi)) { + av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + } + +#if !CONFIG_TXK_SEL + // full forward transform and quantization + const int coeff_ctx = combine_entropy_contexts(*a, *l); + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) + av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + + if (!is_inter_block(mbmi)) { + struct macroblock_plane *const p = &x->plane[plane]; + av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + p->eobs[block]); + av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, + tx_size, &this_rd_stats.dist, &this_rd_stats.sse, + OUTPUT_HAS_DECODED_PIXELS); + } else { + av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, + tx_size, &this_rd_stats.dist, &this_rd_stats.sse, + OUTPUT_HAS_PREDICTED_PIXELS); + } +#if CONFIG_CFL + if (plane == AOM_PLANE_Y && x->cfl_store_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); + } +#endif + rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); + if (args->this_rd + rd > args->best_rd) { + args->exit_early = 1; + return; + } +#if !CONFIG_PVQ + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + this_rd_stats.rate = + av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, + args->use_fast_coef_costing); +#else // !CONFIG_PVQ + this_rd_stats.rate = x->rate; +#endif // !CONFIG_PVQ +#else // !CONFIG_TXK_SEL + av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, a, l, args->use_fast_coef_costing, + &this_rd_stats); +#endif // !CONFIG_TXK_SEL + +#if !CONFIG_PVQ +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, + this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); +#endif // !CONFIG_PVQ + + rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + + // TODO(jingning): temporarily enabled only for luma component + rd = AOMMIN(rd1, rd2); + +#if CONFIG_DAALA_DIST + if (plane == 0 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) { + this_rd_stats.dist = 0; + this_rd_stats.sse = 0; + rd = 0; + x->rate_4x4[block] = this_rd_stats.rate; + } +#endif // CONFIG_DAALA_DIST + +#if !CONFIG_PVQ + this_rd_stats.skip &= !x->plane[plane].eobs[block]; +#else + this_rd_stats.skip &= x->pvq_skip[plane]; +#endif // !CONFIG_PVQ + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->this_rd += rd; + + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } +} + +#if CONFIG_DAALA_DIST +static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t rd, rd1, rd2; + RD_STATS this_rd_stats; + int qm = OD_HVS_QM; + int use_activity_masking = 0; + + (void)tx_size; +#if CONFIG_PVQ + use_activity_masking = x->daala_enc.use_activity_masking; +#endif // CONFIG_PVQ + av1_init_rd_stats(&this_rd_stats); + + if (args->exit_early) return; + + { + const struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int diff_stride = block_size_wide[plane_bsize]; + + const uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + const uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + + unsigned int tmp1, tmp2; + int qindex = x->qindex; + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int i, j; + const int tx_blk_size = 8; + + DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); + + for (j = 0; j < tx_blk_size; j++) + for (i = 0; i < tx_blk_size; i++) + pred8[j * tx_blk_size + i] = pred[j * diff_stride + i]; + + tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm, + use_activity_masking, qindex); + tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm, + use_activity_masking, qindex); + + if (!is_inter_block(mbmi)) { + this_rd_stats.sse = (int64_t)tmp1 * 16; + this_rd_stats.dist = (int64_t)tmp2 * 16; + } else { + // For inter mode, the decoded pixels are provided in pd->pred, + // while the predicted pixels are in dst. + this_rd_stats.sse = (int64_t)tmp2 * 16; + this_rd_stats.dist = (int64_t)tmp1 * 16; + } + } + + rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); + if (args->this_rd + rd > args->best_rd) { + args->exit_early = 1; + return; + } + + { + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + // The rate of the current 8x8 block is the sum of four 4x4 blocks in it. + this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] + + x->rate_4x4[block - max_blocks_wide] + + x->rate_4x4[block - 1] + x->rate_4x4[block]; + } + rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + rd = AOMMIN(rd1, rd2); + + args->rd_stats.dist += this_rd_stats.dist; + args->rd_stats.sse += this_rd_stats.sse; + + args->this_rd += rd; + + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } +} +#endif // CONFIG_DAALA_DIST + +static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_casting; + av1_init_rd_stats(&args.rd_stats); + + if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; + + av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + +#if CONFIG_DAALA_DIST + if (plane == 0 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) + av1_foreach_8x8_transformed_block_in_plane( + xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args); + else +#endif // CONFIG_DAALA_DIST + av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, + &args); + + if (args.exit_early) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +#if CONFIG_SUPERTX +void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, + int64_t *distortion, int *skippable, + int64_t *sse, int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.cpi = cpi; + args.x = x; + args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_casting; + +#if CONFIG_EXT_TX + assert(tx_size < TX_SIZES); +#endif // CONFIG_EXT_TX + + if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size; + + av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); + + block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size, + &args); + + if (args.exit_early) { + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } else { + *distortion = args.rd_stats.dist; + *rate = args.rd_stats.rate; + *sse = args.rd_stats.sse; + *skippable = !x->plane[plane].eobs[0]; + } +} +#endif // CONFIG_SUPERTX + +static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + + const int tx_select = + cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; + + if (tx_select) { + const int is_inter = is_inter_block(mbmi); + const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize] + : intra_tx_size_cat_lookup[bsize]; + const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; + const int depth = tx_size_to_depth(coded_tx_size); + const int tx_size_ctx = get_tx_size_context(xd); + const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; + return r_tx_size; + } else { + return 0; + } +} + +// #TODO(angiebird): use this function whenever it's possible +int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, TX_SIZE tx_size, + TX_TYPE tx_type) { + if (plane > 0) return 0; + + const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int is_inter = is_inter_block(mbmi); +#if CONFIG_EXT_TX + const AV1_COMMON *cm = &cpi->common; + if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->mbmi.segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return cpi + ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type]; + } else { + if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) + return cpi->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]] + [mbmi->mode][tx_type]; + } + } +#else + (void)bsize; + if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] && + !FIXED_TX_TYPE) { + if (is_inter) { + return cpi->inter_tx_type_costs[tx_size][tx_type]; + } else { + return cpi->intra_tx_type_costs[tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]] + [tx_type]; + } + } +#endif // CONFIG_EXT_TX + return 0; +} +static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, + TX_TYPE tx_type, int tx_size) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t rd = INT64_MAX; + aom_prob skip_prob = av1_get_skip_prob(cm, xd); + int s0, s1; + const int is_inter = is_inter_block(mbmi); + const int tx_select = + cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; + + const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size); + + assert(skip_prob > 0); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + s0 = av1_cost_bit(skip_prob, 0); + s1 = av1_cost_bit(skip_prob, 1); + + mbmi->tx_type = tx_type; + mbmi->tx_size = tx_size; + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size, + cpi->sf.use_fast_coef_costing); + if (rd_stats->rate == INT_MAX) return INT64_MAX; +#if !CONFIG_TXK_SEL + int plane = 0; + rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, plane, tx_size, tx_type); +#endif + + if (rd_stats->skip) { + if (is_inter) { + rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + } else { + rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, + rd_stats->sse); + } + } else { + rd = RDCOST(x->rdmult, x->rddiv, + rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist); + } + + if (tx_select) rd_stats->rate += r_tx_size; + + if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && + !(rd_stats->skip)) + rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + + return rd; +} + +static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, + TX_TYPE tx_type, TX_SIZE tx_size) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + const int is_inter = is_inter_block(mbmi); + int prune = 0; + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) + // passing -1 in for tx_type indicates that all 1D + // transforms should be considered for pruning + prune = prune_tx_types(cpi, bs, x, xd, -1); + +#if CONFIG_REF_MV + if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; +#endif // CONFIG_REF_MV + if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) + return 1; + if (!is_inter && x->use_default_intra_tx_type && + tx_type != get_default_tx_type(0, xd, 0, tx_size)) + return 1; + if (is_inter && x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, 0, tx_size)) + return 1; + if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1; +#if CONFIG_EXT_TX + const AV1_COMMON *const cm = &cpi->common; + int ext_tx_set = + get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used); + if (is_inter) { + if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1; + if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { + if (!do_tx_type_search(tx_type, prune)) return 1; + } + } else { + if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { + if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1; + } + if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1; + } +#else // CONFIG_EXT_TX + if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1; + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && + !do_tx_type_search(tx_type, prune)) + return 1; +#endif // CONFIG_EXT_TX + return 0; +} + +#if CONFIG_EXT_INTER +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int *r, int64_t *d, int *s, + int64_t *sse, int64_t ref_best_rd) { + RD_STATS rd_stats; + int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT, + max_txsize_lookup[bs]); + *r = rd_stats.rate; + *d = rd_stats.dist; + *s = rd_stats.skip; + *sse = rd_stats.sse; + return rd; +} +#endif // CONFIG_EXT_INTER + +static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + TX_TYPE tx_type, best_tx_type = DCT_DCT; + int64_t this_rd, best_rd = INT64_MAX; + aom_prob skip_prob = av1_get_skip_prob(cm, xd); + int s0 = av1_cost_bit(skip_prob, 0); + int s1 = av1_cost_bit(skip_prob, 1); + const int is_inter = is_inter_block(mbmi); + int prune = 0; + const int plane = 0; +#if CONFIG_EXT_TX + int ext_tx_set; +#endif // CONFIG_EXT_TX + av1_invalid_rd_stats(rd_stats); + + mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); +#if CONFIG_VAR_TX + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif // CONFIG_VAR_TX +#if CONFIG_EXT_TX + ext_tx_set = + get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used); +#endif // CONFIG_EXT_TX + + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) +#if CONFIG_EXT_TX + prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set); +#else + prune = prune_tx_types(cpi, bs, x, xd, 0); +#endif // CONFIG_EXT_TX +#if CONFIG_EXT_TX + if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) > + 1 && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_PVQ + od_rollback_buffer pre_buf, post_buf; + + od_encode_checkpoint(&x->daala_enc, &pre_buf); + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + + for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + RD_STATS this_rd_stats; + if (is_inter) { + if (x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) + continue; + if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; + if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { + if (!do_tx_type_search(tx_type, prune)) continue; + } + } else { + if (x->use_default_intra_tx_type && + tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) + continue; + if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) { + if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; + } + if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue; + } + + mbmi->tx_type = tx_type; + + txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, + mbmi->tx_size, cpi->sf.use_fast_coef_costing); +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + if (this_rd_stats.rate == INT_MAX) continue; + av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type); + + if (this_rd_stats.skip) + this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + else + this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, + this_rd_stats.dist); + if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && + !this_rd_stats.skip) + this_rd = + AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + + if (this_rd < best_rd) { + best_rd = this_rd; + best_tx_type = mbmi->tx_type; + *rd_stats = this_rd_stats; +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + } + } +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + } else { + mbmi->tx_type = DCT_DCT; + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + } +#else // CONFIG_EXT_TX + if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) { + for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + RD_STATS this_rd_stats; + if (!is_inter && x->use_default_intra_tx_type && + tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) + continue; + if (is_inter && x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size)) + continue; + mbmi->tx_type = tx_type; + txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs, + mbmi->tx_size, cpi->sf.use_fast_coef_costing); + if (this_rd_stats.rate == INT_MAX) continue; + + av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type); + if (is_inter) { + if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE && + !do_tx_type_search(tx_type, prune)) + continue; + } + if (this_rd_stats.skip) + this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + else + this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, + this_rd_stats.dist); + if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) + this_rd = + AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + + if (this_rd < best_rd) { + best_rd = this_rd; + best_tx_type = mbmi->tx_type; + *rd_stats = this_rd_stats; + } + } + } else { + mbmi->tx_type = DCT_DCT; + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + } +#endif // CONFIG_EXT_TX + mbmi->tx_type = best_tx_type; +} + +static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + + mbmi->tx_size = TX_4X4; + mbmi->tx_type = DCT_DCT; +#if CONFIG_VAR_TX + mbmi->min_tx_size = get_min_tx_size(TX_4X4); +#endif // CONFIG_VAR_TX + + txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); +} + +#if CONFIG_TXK_SEL || CONFIG_VAR_TX +static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]); + return num_blk; +} +#endif // CONFIG_TXK_SEL || CONFIG_VAR_TX + +static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, BLOCK_SIZE bs) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t rd = INT64_MAX; + int n; + int start_tx, end_tx; + int64_t best_rd = INT64_MAX, last_rd = INT64_MAX; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + TX_SIZE best_tx_size = max_tx_size; + TX_TYPE best_tx_type = DCT_DCT; +#if CONFIG_TXK_SEL + TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + const int num_blk = bsize_to_num_blk(bs); +#endif // CONFIG_TXK_SEL + const int tx_select = cm->tx_mode == TX_MODE_SELECT; + const int is_inter = is_inter_block(mbmi); +#if CONFIG_PVQ + od_rollback_buffer buf; + od_encode_checkpoint(&x->daala_enc, &buf); +#endif // CONFIG_PVQ + + av1_invalid_rd_stats(rd_stats); + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + int evaluate_rect_tx = 0; + if (tx_select) { + evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi); + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); + evaluate_rect_tx = is_rect_tx(chosen_tx_size); + assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi))); + } + if (evaluate_rect_tx) { + TX_TYPE tx_start = DCT_DCT; + TX_TYPE tx_end = TX_TYPES; +#if CONFIG_TXK_SEL + // The tx_type becomes dummy when lv_map is on. The tx_type search will be + // performed in av1_search_txk_type() + tx_end = DCT_DCT + 1; +#endif + TX_TYPE tx_type; + for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { +#if CONFIG_REF_MV + if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; +#endif // CONFIG_REF_MV + const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; + RD_STATS this_rd_stats; + int ext_tx_set = + get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used); + if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) || + (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) { + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, + rect_tx_size); + if (rd < best_rd) { +#if CONFIG_TXK_SEL + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * num_blk); +#endif + best_tx_type = tx_type; + best_tx_size = rect_tx_size; + best_rd = rd; + *rd_stats = this_rd_stats; + } + } +#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + const int is_inter = is_inter_block(mbmi); + if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; +#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + } + } +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + if (tx_select) { + start_tx = max_tx_size; + end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4; + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); + start_tx = chosen_tx_size; + end_tx = chosen_tx_size; + } + + last_rd = INT64_MAX; + for (n = start_tx; n >= end_tx; --n) { +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (is_rect_tx(n)) break; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + TX_TYPE tx_start = DCT_DCT; + TX_TYPE tx_end = TX_TYPES; +#if CONFIG_TXK_SEL + // The tx_type becomes dummy when lv_map is on. The tx_type search will be + // performed in av1_search_txk_type() + tx_end = DCT_DCT + 1; +#endif + TX_TYPE tx_type; + for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { + RD_STATS this_rd_stats; + if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n); +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &buf); +#endif // CONFIG_PVQ + // Early termination in transform size search. + if (cpi->sf.tx_size_search_breakout && + (rd == INT64_MAX || + (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) || + (n < (int)max_tx_size && rd > last_rd))) + break; + + last_rd = rd; + if (rd < best_rd) { +#if CONFIG_TXK_SEL + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * num_blk); +#endif + best_tx_type = tx_type; + best_tx_size = n; + best_rd = rd; + *rd_stats = this_rd_stats; + } +#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + const int is_inter = is_inter_block(mbmi); + if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; +#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + } + } + mbmi->tx_size = best_tx_size; + mbmi->tx_type = best_tx_type; +#if CONFIG_TXK_SEL + memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk); +#endif + +#if CONFIG_VAR_TX + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif // CONFIG_VAR_TX + +#if !CONFIG_EXT_TX + if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT); +#endif // !CONFIG_EXT_TX +#if CONFIG_PVQ + if (best_rd != INT64_MAX) { + txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size); + } +#endif // CONFIG_PVQ +} + +static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd) { + MACROBLOCKD *xd = &x->e_mbd; + av1_init_rd_stats(rd_stats); + + assert(bs == xd->mi[0]->mbmi.sb_type); + + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { + choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else { + choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + } +} + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D117_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D63_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D207_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D153_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +// Model based RD estimation for luma intra blocks. +static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mode_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + RD_STATS this_rd_stats; + int row, col; + int64_t temp_sse, this_rd; + const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + mbmi->tx_size = tx_size; + // Prediction. + const int step = stepr * stepc; + int block = 0; + for (row = 0; row < max_blocks_high; row += stepr) { + for (col = 0; col < max_blocks_wide; col += stepc) { + av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size); + block += step; + } + } + // RD estimation. + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse); +#if CONFIG_EXT_INTRA + if (av1_is_directional_mode(mbmi->mode, bsize)) { + mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + if (mbmi->mode == DC_PRED) { + const aom_prob prob = cpi->common.fc->filter_intra_probs[0]; + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { + const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0]; + mode_cost += (av1_cost_bit(prob, 1) + + write_uniform_cost(FILTER_INTRA_MODES, mode)); + } else { + mode_cost += av1_cost_bit(prob, 0); + } + } +#endif // CONFIG_FILTER_INTRA + this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + mode_cost, + this_rd_stats.dist); + return this_rd; +} + +#if CONFIG_PALETTE +// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x +// new_height'. Extra rows and columns are filled in by copying last valid +// row/column. +static void extend_palette_color_map(uint8_t *const color_map, int orig_width, + int orig_height, int new_width, + int new_height) { + int j; + assert(new_width >= orig_width); + assert(new_height >= orig_height); + if (new_width == orig_width && new_height == orig_height) return; + + for (j = orig_height - 1; j >= 0; --j) { + memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); + // Copy last column to extra columns. + memset(color_map + j * new_width + orig_width, + color_map[j * new_width + orig_width - 1], new_width - orig_width); + } + // Copy last row to extra rows. + for (j = orig_height; j < new_height; ++j) { + memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, + new_width); + } +} + +static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int palette_ctx, + int dc_mode_cost, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, + int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable) { + int rate_overhead = 0; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + MB_MODE_INFO *const mbmi = &mic->mbmi; + int this_rate, colors, n; + const int src_stride = x->plane[0].src.stride; + const uint8_t *const src = x->plane[0].src.buf; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + + assert(cpi->common.allow_screen_content_tools); + +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) + colors = av1_count_colors_highbd(src, src_stride, rows, cols, + cpi->common.bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + colors = av1_count_colors(src, src_stride, rows, cols); +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; +#endif // CONFIG_FILTER_INTRA + + if (colors > 1 && colors <= 64) { + int r, c, i, j, k, palette_mode_cost; + const int max_itr = 50; + uint8_t color_order[PALETTE_MAX_SIZE]; + float *const data = x->palette_buffer->kmeans_data_buf; + float centroids[PALETTE_MAX_SIZE]; + float lb, ub, val; + RD_STATS tokenonly_rd_stats; + int64_t this_rd, this_model_rd; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; +#if CONFIG_HIGHBITDEPTH + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + if (cpi->common.use_highbitdepth) + lb = ub = src16[0]; + else +#endif // CONFIG_HIGHBITDEPTH + lb = ub = src[0]; + +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + val = src16[r * src_stride + c]; + data[r * cols + c] = val; + if (val < lb) + lb = val; + else if (val > ub) + ub = val; + } + } + } else { +#endif // CONFIG_HIGHBITDEPTH + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + val = src[r * src_stride + c]; + data[r * cols + c] = val; + if (val < lb) + lb = val; + else if (val > ub) + ub = val; + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + mbmi->mode = DC_PRED; +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; +#endif // CONFIG_FILTER_INTRA + + if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0; + + for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; + --n) { + for (i = 0; i < n; ++i) + centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; + av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); + k = av1_remove_duplicates(centroids, n); + +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) + for (i = 0; i < k; ++i) + pmi->palette_colors[i] = + clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + for (i = 0; i < k; ++i) + pmi->palette_colors[i] = clip_pixel((int)centroids[i]); + pmi->palette_size[0] = k; + + av1_calc_indices(data, centroids, color_map, rows * cols, k, 1); + extend_palette_color_map(color_map, cols, rows, block_width, + block_height); + palette_mode_cost = + dc_mode_cost + + cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] + + write_uniform_cost(k, color_map[0]) + + av1_cost_bit( + av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], + 1); + palette_mode_cost += av1_palette_color_cost_y(pmi, cpi->common.bit_depth); + for (i = 0; i < rows; ++i) { + for (j = (i == 0 ? 1 : 0); j < cols; ++j) { + int color_idx; + const int color_ctx = av1_get_palette_color_index_context( + color_map, block_width, i, j, k, color_order, &color_idx); + assert(color_idx >= 0 && color_idx < k); + palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE] + [color_ctx][color_idx]; + } + } + this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + if (*best_model_rd != INT64_MAX && + this_model_rd > *best_model_rd + (*best_model_rd >> 1)) + continue; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + palette_mode_cost; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { + tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); + } + if (this_rd < *best_rd) { + *best_rd = this_rd; + memcpy(best_palette_color_map, color_map, + block_width * block_height * sizeof(color_map[0])); + *best_mbmi = *mbmi; + rate_overhead = this_rate - tokenonly_rd_stats.rate; + if (rate) *rate = this_rate; + if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; + if (distortion) *distortion = tokenonly_rd_stats.dist; + if (skippable) *skippable = tokenonly_rd_stats.skip; + } + } + } + + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + } + *mbmi = *best_mbmi; + return rate_overhead; +} +#endif // CONFIG_PALETTE + +static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( + const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col, + PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, + BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) { + const AV1_COMMON *const cm = &cpi->common; + PREDICTION_MODE mode; + MACROBLOCKD *const xd = &x->e_mbd; + int64_t best_rd = rd_thresh; + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4]; + uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4]; +#if CONFIG_CB4X4 + // TODO(jingning): This is a temporal change. The whole function should be + // out when cb4x4 is enabled. + ENTROPY_CONTEXT ta[4], tempa[4]; + ENTROPY_CONTEXT tl[4], templ[4]; +#else + ENTROPY_CONTEXT ta[2], tempa[2]; + ENTROPY_CONTEXT tl[2], templ[2]; +#endif // CONFIG_CB4X4 + + const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; + const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; + const int tx_width_unit = tx_size_wide_unit[tx_size]; + const int tx_height_unit = tx_size_high_unit[tx_size]; + const int pred_block_width = block_size_wide[bsize]; + const int pred_block_height = block_size_high[bsize]; + const int tx_width = tx_size_wide[tx_size]; + const int tx_height = tx_size_high[tx_size]; + const int pred_width_in_transform_blocks = pred_block_width / tx_width; + const int pred_height_in_transform_blocks = pred_block_height / tx_height; + int idx, idy; + int best_can_skip = 0; + uint8_t best_dst[8 * 8]; +#if CONFIG_HIGHBITDEPTH + uint16_t best_dst16[8 * 8]; +#endif // CONFIG_HIGHBITDEPTH + const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; +#if CONFIG_EXT_TX && CONFIG_RECT_TX + const int sub_bsize = bsize; +#else + const int sub_bsize = BLOCK_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + +#if CONFIG_PVQ + od_rollback_buffer pre_buf, post_buf; + od_encode_checkpoint(&x->daala_enc, &pre_buf); + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + + assert(bsize < BLOCK_8X8); + assert(tx_width < 8 || tx_height < 8); +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (is_lossless) + assert(tx_width == 4 && tx_height == 4); + else + assert(tx_width == pred_block_width && tx_height == pred_block_height); +#else + assert(tx_width == 4 && tx_height == 4); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0])); + memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0])); + + xd->mi[0]->mbmi.tx_size = tx_size; + +#if CONFIG_PALETTE + xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0; +#endif // CONFIG_PALETTE + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &pre_buf); +#endif + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int64_t this_rd; + int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; + int can_skip = 1; + + if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & + (1 << mode))) + continue; + + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) continue; + } + + memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); + memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); + + for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) { + for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) { + const int block_raster_idx = (row + idy) * 2 + (col + idx); + const int block = + av1_raster_order_to_block_index(tx_size, block_raster_idx); + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; +#if !CONFIG_PVQ + int16_t *const src_diff = av1_raster_block_offset_int16( + BLOCK_8X8, block_raster_idx, p->src_diff); +#endif + int skip; + assert(block < 4); + assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, + idx == 0 && idy == 0)); + assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, + block == 0 || block == 2)); + xd->mi[0]->bmi[block_raster_idx].as_mode = mode; + av1_predict_intra_block( + xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst, + dst_stride, dst, dst_stride, col + idx, row + idy, 0); +#if !CONFIG_PVQ + aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src, + src_stride, dst, dst_stride, xd->bd); +#endif + if (is_lossless) { + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); +#if !CONFIG_PVQ + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, + tempa + idx, templ + idy, + cpi->sf.use_fast_coef_costing); + skip = (p->eobs[block] == 0); + can_skip &= skip; + tempa[idx] = !skip; + templ[idy] = !skip; +#if CONFIG_EXT_TX + if (tx_size == TX_8X4) { + tempa[idx + 1] = tempa[idx]; + } else if (tx_size == TX_4X8) { + templ[idy + 1] = templ[idy]; + } +#endif // CONFIG_EXT_TX +#else + (void)scan_order; + + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_B); + + ratey += x->rate; + skip = x->pvq_skip[0]; + tempa[idx] = !skip; + templ[idy] = !skip; + can_skip &= skip; +#endif + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; +#if CONFIG_PVQ + if (!skip) +#endif + av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), + DCT_DCT, tx_size, dst, dst_stride, + p->eobs[block]); + } else { + int64_t dist; + unsigned int tmp; + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + const int coeff_ctx = + combine_entropy_contexts(tempa[idx], templ[idy]); +#if !CONFIG_PVQ + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); + ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, + tempa + idx, templ + idy, + cpi->sf.use_fast_coef_costing); + skip = (p->eobs[block] == 0); + can_skip &= skip; + tempa[idx] = !skip; + templ[idy] = !skip; +#if CONFIG_EXT_TX + if (tx_size == TX_8X4) { + tempa[idx + 1] = tempa[idx]; + } else if (tx_size == TX_4X8) { + templ[idy + 1] = templ[idy]; + } +#endif // CONFIG_EXT_TX +#else + (void)scan_order; + + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + ratey += x->rate; + skip = x->pvq_skip[0]; + tempa[idx] = !skip; + templ[idy] = !skip; + can_skip &= skip; +#endif +#if CONFIG_PVQ + if (!skip) +#endif + av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), + tx_type, tx_size, dst, dst_stride, + p->eobs[block]); + cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); + dist = (int64_t)tmp << 4; + distortion += dist; + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; + } + } + } + + rate += ratey; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + best_can_skip = can_skip; + *best_mode = mode; + memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); + memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif + for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { + memcpy(best_dst16 + idy * 8, + CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); + } + } + next_highbd : {} +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif + } + + if (best_rd >= rd_thresh) return best_rd; + +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &post_buf); +#endif + + if (y_skip) *y_skip &= best_can_skip; + + for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) { + memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + best_dst16 + idy * 8, + pred_width_in_transform_blocks * 4 * sizeof(uint16_t)); + } + + return best_rd; + } +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int64_t this_rd; + int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; + int can_skip = 1; + + if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] & + (1 << mode))) { + continue; + } + + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) continue; + } + + memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0])); + memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0])); + + for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) { + for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) { + const int block_raster_idx = (row + idy) * 2 + (col + idx); + int block = av1_raster_order_to_block_index(tx_size, block_raster_idx); + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; +#if !CONFIG_PVQ + int16_t *const src_diff = av1_raster_block_offset_int16( + BLOCK_8X8, block_raster_idx, p->src_diff); +#endif // !CONFIG_PVQ + int skip; + assert(block < 4); + assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, + idx == 0 && idy == 0)); + assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, + block == 0 || block == 2)); + xd->mi[0]->bmi[block_raster_idx].as_mode = mode; + av1_predict_intra_block(xd, pd->width, pd->height, + txsize_to_bsize[tx_size], mode, dst, dst_stride, + dst, dst_stride, +#if CONFIG_CB4X4 + 2 * (col + idx), 2 * (row + idy), +#else + col + idx, row + idy, +#endif // CONFIG_CB4X4 + 0); +#if !CONFIG_PVQ + aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, + dst, dst_stride); +#endif // !CONFIG_PVQ + + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); +#if CONFIG_CB4X4 + block = 4 * block; +#endif // CONFIG_CB4X4 +#if !CONFIG_PVQ + const AV1_XFORM_QUANT xform_quant = + is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + av1_xform_quant(cm, x, 0, block, +#if CONFIG_CB4X4 + 2 * (row + idy), 2 * (col + idx), +#else + row + idy, col + idx, +#endif // CONFIG_CB4X4 + BLOCK_8X8, tx_size, coeff_ctx, xform_quant); + + if (!is_lossless) { + av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); + } + + ratey += + av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx, + templ + idy, cpi->sf.use_fast_coef_costing); + skip = (p->eobs[block] == 0); + can_skip &= skip; + tempa[idx] = !skip; + templ[idy] = !skip; +#if CONFIG_EXT_TX + if (tx_size == TX_8X4) { + tempa[idx + 1] = tempa[idx]; + } else if (tx_size == TX_4X8) { + templ[idy + 1] = templ[idy]; + } +#endif // CONFIG_EXT_TX +#else + (void)scan_order; + + av1_xform_quant(cm, x, 0, block, +#if CONFIG_CB4X4 + 2 * (row + idy), 2 * (col + idx), +#else + row + idy, col + idx, +#endif // CONFIG_CB4X4 + BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + + ratey += x->rate; + skip = x->pvq_skip[0]; + tempa[idx] = !skip; + templ[idy] = !skip; + can_skip &= skip; +#endif // !CONFIG_PVQ + + if (!is_lossless) { // To use the pixel domain distortion, we need to + // calculate inverse txfm *before* calculating RD + // cost. Compared to calculating the distortion in + // the frequency domain, the overhead of encoding + // effort is low. +#if CONFIG_PVQ + if (!skip) +#endif // CONFIG_PVQ + av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), + tx_type, tx_size, dst, dst_stride, + p->eobs[block]); + unsigned int tmp; + cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); + const int64_t dist = (int64_t)tmp << 4; + distortion += dist; + } + + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next; + + if (is_lossless) { // Calculate inverse txfm *after* RD cost. +#if CONFIG_PVQ + if (!skip) +#endif // CONFIG_PVQ + av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), + DCT_DCT, tx_size, dst, dst_stride, + p->eobs[block]); + } + } + } + + rate += ratey; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + best_can_skip = can_skip; + *best_mode = mode; + memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0])); + memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0])); +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) + memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, + pred_width_in_transform_blocks * 4); + } + next : {} +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + } // mode decision loop + + if (best_rd >= rd_thresh) return best_rd; + +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + + if (y_skip) *y_skip &= best_can_skip; + + for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) + memcpy(dst_init + idy * dst_stride, best_dst + idy * 8, + pred_width_in_transform_blocks * 4); + + return best_rd; +} + +static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, + MACROBLOCK *mb, int *rate, + int *rate_y, int64_t *distortion, + int *y_skip, int64_t best_rd) { + const MACROBLOCKD *const xd = &mb->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + MB_MODE_INFO *const mbmi = &mic->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; + const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + int cost = 0; + int64_t total_distortion = 0; + int tot_rate_y = 0; + int64_t total_rd = 0; + const int *bmode_costs = cpi->mbmode_cost[0]; + const int is_lossless = xd->lossless[mbmi->segment_id]; +#if CONFIG_EXT_TX && CONFIG_RECT_TX + const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize]; +#else + const TX_SIZE tx_size = TX_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + +#if CONFIG_EXT_INTRA +#if CONFIG_INTRA_INTERP + mbmi->intra_filter = INTRA_FILTER_LINEAR; +#endif // CONFIG_INTRA_INTERP +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; +#endif // CONFIG_FILTER_INTRA + + // TODO(any): Add search of the tx_type to improve rd performance at the + // expense of speed. + mbmi->tx_type = DCT_DCT; + mbmi->tx_size = tx_size; + + if (y_skip) *y_skip = 1; + + // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this + // 8x8 coding block. + for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) { + for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) { + PREDICTION_MODE best_mode = DC_PRED; + int r = INT_MAX, ry = INT_MAX; + int64_t d = INT64_MAX, this_rd = INT64_MAX; + int j; + const int pred_block_idx = idy * 2 + idx; + if (cpi->common.frame_type == KEY_FRAME) { + const PREDICTION_MODE A = + av1_above_block_mode(mic, above_mi, pred_block_idx); + const PREDICTION_MODE L = + av1_left_block_mode(mic, left_mi, pred_block_idx); + + bmode_costs = cpi->y_mode_costs[A][L]; + } + this_rd = rd_pick_intra_sub_8x8_y_subblock_mode( + cpi, mb, idy, idx, &best_mode, bmode_costs, + xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, + &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); +#if !CONFIG_DAALA_DIST + if (this_rd >= best_rd - total_rd) return INT64_MAX; +#endif // !CONFIG_DAALA_DIST + total_rd += this_rd; + cost += r; + total_distortion += d; + tot_rate_y += ry; + + mic->bmi[pred_block_idx].as_mode = best_mode; + for (j = 1; j < pred_height_in_4x4_blocks; ++j) + mic->bmi[pred_block_idx + j * 2].as_mode = best_mode; + for (j = 1; j < pred_width_in_4x4_blocks; ++j) + mic->bmi[pred_block_idx + j].as_mode = best_mode; + + if (total_rd >= best_rd) return INT64_MAX; + } + } + mbmi->mode = mic->bmi[3].as_mode; + +#if CONFIG_DAALA_DIST + { + const struct macroblock_plane *p = &mb->plane[0]; + const struct macroblockd_plane *pd = &xd->plane[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + uint8_t *src = p->src.buf; + uint8_t *dst = pd->dst.buf; + int use_activity_masking = 0; + int qm = OD_HVS_QM; + +#if CONFIG_PVQ + use_activity_masking = mb->daala_enc.use_activity_masking; +#endif // CONFIG_PVQ + // Daala-defined distortion computed for the block of 8x8 pixels + total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, + qm, use_activity_masking, mb->qindex) + << 4; + } +#endif // CONFIG_DAALA_DIST + // Add in the cost of the transform type + if (!is_lossless) { + int rate_tx_type = 0; +#if CONFIG_EXT_TX + if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) > + 1) { + const int eset = + get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used); + rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]] + [mbmi->mode][mbmi->tx_type]; + } +#else + rate_tx_type = + cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; +#endif // CONFIG_EXT_TX + assert(mbmi->tx_size == tx_size); + cost += rate_tx_type; + tot_rate_y += rate_tx_type; + } + + *rate = cost; + *rate_y = tot_rate_y; + *distortion = total_distortion; + + return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); +} + +#if CONFIG_FILTER_INTRA +// Return 1 if an filter intra mode is selected; return 0 otherwise. +static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, int mode_cost, + int64_t *best_rd, int64_t *best_model_rd, + uint16_t skip_mask) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + MB_MODE_INFO *mbmi = &mic->mbmi; + int filter_intra_selected_flag = 0; + FILTER_INTRA_MODE mode; + TX_SIZE best_tx_size = TX_4X4; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + TX_TYPE best_tx_type; + + av1_zero(filter_intra_mode_info); + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1; + mbmi->mode = DC_PRED; +#if CONFIG_PALETTE + mbmi->palette_mode_info.palette_size[0] = 0; +#endif // CONFIG_PALETTE + + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { + int this_rate; + int64_t this_rd, this_model_rd; + RD_STATS tokenonly_rd_stats; + if (skip_mask & (1 << mode)) continue; + mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode; + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + if (*best_model_rd != INT64_MAX && + this_model_rd > *best_model_rd + (*best_model_rd >> 1)) + continue; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + + av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) + + write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < *best_rd) { + *best_rd = this_rd; + best_tx_size = mic->mbmi.tx_size; + filter_intra_mode_info = mbmi->filter_intra_mode_info; + best_tx_type = mic->mbmi.tx_type; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip; + filter_intra_selected_flag = 1; + } + } + + if (filter_intra_selected_flag) { + mbmi->mode = DC_PRED; + mbmi->tx_size = best_tx_size; + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = + filter_intra_mode_info.use_filter_intra_mode[0]; + mbmi->filter_intra_mode_info.filter_intra_mode[0] = + filter_intra_mode_info.filter_intra_mode[0]; + mbmi->tx_type = best_tx_type; + return 1; + } else { + return 0; + } +} +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_EXT_INTRA +// Run RD calculation with given luma intra prediction angle., and return +// the RD cost. Update the best mode info. if the RD cost is the best so far. +static int64_t calc_rd_given_intra_angle( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, + int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, + RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, + TX_TYPE *best_tx_type, +#if CONFIG_INTRA_INTERP + INTRA_FILTER *best_filter, +#endif // CONFIG_INTRA_INTERP + int64_t *best_rd, int64_t *best_model_rd) { + int this_rate; + RD_STATS tokenonly_rd_stats; + int64_t this_rd, this_model_rd; + MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + + mbmi->angle_delta[0] = angle_delta; + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + if (*best_model_rd != INT64_MAX && + this_model_rd > *best_model_rd + (*best_model_rd >> 1)) + return INT64_MAX; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); + if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; + + this_rate = tokenonly_rd_stats.rate + mode_cost + + write_uniform_cost(2 * max_angle_delta + 1, + mbmi->angle_delta[0] + max_angle_delta); + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_angle_delta = mbmi->angle_delta[0]; + *best_tx_size = mbmi->tx_size; +#if CONFIG_INTRA_INTERP + *best_filter = mbmi->intra_filter; +#endif // CONFIG_INTRA_INTERP + *best_tx_type = mbmi->tx_type; + *rate = this_rate; + rd_stats->rate = tokenonly_rd_stats.rate; + rd_stats->dist = tokenonly_rd_stats.dist; + rd_stats->skip = tokenonly_rd_stats.skip; + } + return this_rd; +} + +// With given luma directional intra prediction mode, pick the best angle delta +// Return the RD cost corresponding to the best angle delta. +static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int mode_cost, + int64_t best_rd, + int64_t *best_model_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + MB_MODE_INFO *mbmi = &mic->mbmi; + int i, angle_delta, best_angle_delta = 0; + int first_try = 1; +#if CONFIG_INTRA_INTERP + int p_angle; + const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); + INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR; +#endif // CONFIG_INTRA_INTERP + int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + TX_SIZE best_tx_size = mic->mbmi.tx_size; + TX_TYPE best_tx_type = mbmi->tx_type; + + for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + + for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { +#if CONFIG_INTRA_INTERP + for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { + if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; + mic->mbmi.intra_filter = filter; +#endif // CONFIG_INTRA_INTERP + for (i = 0; i < 2; ++i) { + best_rd_in = (best_rd == INT64_MAX) + ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 3 : 5))); + this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, +#if CONFIG_INTRA_INTERP + mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], +#else + mode_cost, +#endif // CONFIG_INTRA_INTERP + best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, + rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, +#if CONFIG_INTRA_INTERP + &best_filter, +#endif // CONFIG_INTRA_INTERP + &best_rd, best_model_rd); + rd_cost[2 * angle_delta + i] = this_rd; + if (first_try && this_rd == INT64_MAX) return best_rd; + first_try = 0; + if (angle_delta == 0) { + rd_cost[1] = this_rd; + break; + } + } +#if CONFIG_INTRA_INTERP + } +#endif // CONFIG_INTRA_INTERP + } + + assert(best_rd != INT64_MAX); + for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + int64_t rd_thresh; +#if CONFIG_INTRA_INTERP + for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) { + if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue; + mic->mbmi.intra_filter = filter; +#endif // CONFIG_INTRA_INTERP + for (i = 0; i < 2; ++i) { + int skip_search = 0; + rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + calc_rd_given_intra_angle( + cpi, x, bsize, +#if CONFIG_INTRA_INTERP + mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], +#else + mode_cost, +#endif // CONFIG_INTRA_INTERP + best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, + rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, +#if CONFIG_INTRA_INTERP + &best_filter, +#endif // CONFIG_INTRA_INTERP + &best_rd, best_model_rd); + } + } +#if CONFIG_INTRA_INTERP + } +#endif // CONFIG_INTRA_INTERP + } + +#if CONFIG_INTRA_INTERP + if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) { + p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP; + if (av1_is_intra_filter_switchable(p_angle)) { + for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) { + mic->mbmi.intra_filter = filter; + this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, + mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], + best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter, + &best_rd, best_model_rd); + } + } + } +#endif // CONFIG_INTRA_INTERP + + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[0] = best_angle_delta; +#if CONFIG_INTRA_INTERP + mic->mbmi.intra_filter = best_filter; +#endif // CONFIG_INTRA_INTERP + mbmi->tx_type = best_tx_type; + return best_rd; +} + +// Indices are sign, integer, and fractional part of the gradient value +static const uint8_t gradient_to_angle_bin[2][7][16] = { + { + { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + }, + { + { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 }, + { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 }, + { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + }, +}; + +static const uint8_t mode_to_angle_bin[INTRA_MODES] = { + 0, 2, 6, 0, 4, 3, 5, 7, 1, 0, +}; + +static void angle_estimation(const uint8_t *src, int src_stride, int rows, + int cols, uint8_t *directional_mode_skip_mask) { + int i, r, c, index, dx, dy, temp, sn, remd, quot; + uint64_t hist[DIRECTIONAL_MODES]; + uint64_t hist_sum = 0; + + memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); + src += src_stride; + for (r = 1; r < rows; ++r) { + for (c = 1; c < cols; ++c) { + dx = src[c] - src[c - 1]; + dy = src[c] - src[c - src_stride]; + temp = dx * dx + dy * dy; + if (dy == 0) { + index = 2; + } else { + sn = (dx > 0) ^ (dy > 0); + dx = abs(dx); + dy = abs(dy); + remd = dx % dy; + quot = dx / dy; + remd = remd * 16 / dy; + index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)]; + } + hist[index] += temp; + } + src += src_stride; + } + + for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; + for (i = 0; i < INTRA_MODES; ++i) { + if (i != DC_PRED && i != TM_PRED) { + const uint8_t angle_bin = mode_to_angle_bin[i]; + uint64_t score = 2 * hist[angle_bin]; + int weight = 2; + if (angle_bin > 0) { + score += hist[angle_bin - 1]; + ++weight; + } + if (angle_bin < DIRECTIONAL_MODES - 1) { + score += hist[angle_bin + 1]; + ++weight; + } + if (score * ANGLE_SKIP_THRESH < hist_sum * weight) + directional_mode_skip_mask[i] = 1; + } + } +} + +#if CONFIG_HIGHBITDEPTH +static void highbd_angle_estimation(const uint8_t *src8, int src_stride, + int rows, int cols, + uint8_t *directional_mode_skip_mask) { + int i, r, c, index, dx, dy, temp, sn, remd, quot; + uint64_t hist[DIRECTIONAL_MODES]; + uint64_t hist_sum = 0; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); + src += src_stride; + for (r = 1; r < rows; ++r) { + for (c = 1; c < cols; ++c) { + dx = src[c] - src[c - 1]; + dy = src[c] - src[c - src_stride]; + temp = dx * dx + dy * dy; + if (dy == 0) { + index = 2; + } else { + sn = (dx > 0) ^ (dy > 0); + dx = abs(dx); + dy = abs(dy); + remd = dx % dy; + quot = dx / dy; + remd = remd * 16 / dy; + index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)]; + } + hist[index] += temp; + } + src += src_stride; + } + + for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; + for (i = 0; i < INTRA_MODES; ++i) { + if (i != DC_PRED && i != TM_PRED) { + const uint8_t angle_bin = mode_to_angle_bin[i]; + uint64_t score = 2 * hist[angle_bin]; + int weight = 2; + if (angle_bin > 0) { + score += hist[angle_bin - 1]; + ++weight; + } + if (angle_bin < DIRECTIONAL_MODES - 1) { + score += hist[angle_bin + 1]; + ++weight; + } + if (score * ANGLE_SKIP_THRESH < hist_sum * weight) + directional_mode_skip_mask[i] = 1; + } + } +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_EXT_INTRA + +// This function is used only for intra_only frames +static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, int64_t best_rd) { + uint8_t mode_idx; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + MB_MODE_INFO *const mbmi = &mic->mbmi; + MB_MODE_INFO best_mbmi = *mbmi; + int64_t best_model_rd = INT64_MAX; +#if CONFIG_EXT_INTRA + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; +#if CONFIG_INTRA_INTERP + const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); +#endif // CONFIG_INTRA_INTERP + int is_directional_mode; + uint8_t directional_mode_skip_mask[INTRA_MODES]; + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + int beat_best_rd = 0; + uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1; +#endif // CONFIG_FILTER_INTRA + const int *bmode_costs; +#if CONFIG_PALETTE + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + uint8_t *best_palette_color_map = + cpi->common.allow_screen_content_tools + ? x->palette_buffer->best_palette_color_map + : NULL; + int palette_y_mode_ctx = 0; + const int try_palette = + cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8; +#endif // CONFIG_PALETTE + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0); + const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0); + const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1; +#if CONFIG_PVQ + od_rollback_buffer pre_buf, post_buf; + + od_encode_checkpoint(&x->daala_enc, &pre_buf); + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + bmode_costs = cpi->y_mode_costs[A][L]; + +#if CONFIG_EXT_INTRA + mbmi->angle_delta[0] = 0; + memset(directional_mode_skip_mask, 0, + sizeof(directional_mode_skip_mask[0]) * INTRA_MODES); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + highbd_angle_estimation(src, src_stride, rows, cols, + directional_mode_skip_mask); + else +#endif // CONFIG_HIGHBITDEPTH + angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask); +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; +#endif // CONFIG_FILTER_INTRA +#if CONFIG_PALETTE + pmi->palette_size[0] = 0; + if (above_mi) + palette_y_mode_ctx += + (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); + if (left_mi) + palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); +#endif // CONFIG_PALETTE + + if (cpi->sf.tx_type_search.fast_intra_tx_type_search) + x->use_default_intra_tx_type = 1; + else + x->use_default_intra_tx_type = 0; + + /* Y Search for intra prediction mode */ + for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { + RD_STATS this_rd_stats; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd, this_model_rd; + if (mode_idx == FINAL_MODE_SEARCH) { + if (x->use_default_intra_tx_type == 0) break; + mbmi->mode = best_mbmi.mode; + x->use_default_intra_tx_type = 0; + } else { + mbmi->mode = mode_idx; + } +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ +#if CONFIG_EXT_INTRA + mbmi->angle_delta[0] = 0; +#endif // CONFIG_EXT_INTRA + this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); + if (best_model_rd != INT64_MAX && + this_model_rd > best_model_rd + (best_model_rd >> 1)) + continue; + if (this_model_rd < best_model_rd) best_model_rd = this_model_rd; +#if CONFIG_EXT_INTRA + is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); + if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; + if (is_directional_mode) { + this_rd_stats.rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, + bmode_costs[mbmi->mode], best_rd, &best_model_rd); + } else { + super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); + } +#else + super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); +#endif // CONFIG_EXT_INTRA + this_rate_tokenonly = this_rd_stats.rate; + this_distortion = this_rd_stats.dist; + s = this_rd_stats.skip; + + if (this_rate_tokenonly == INT_MAX) continue; + + this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode]; + + if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); + } +#if CONFIG_PALETTE + if (try_palette && mbmi->mode == DC_PRED) { + this_rate += + av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8] + [palette_y_mode_ctx], + 0); + } +#endif // CONFIG_PALETTE +#if CONFIG_FILTER_INTRA + if (mbmi->mode == DC_PRED) + this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0); +#endif // CONFIG_FILTER_INTRA +#if CONFIG_EXT_INTRA + if (is_directional_mode) { +#if CONFIG_INTRA_INTERP + const int p_angle = + mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; + if (av1_is_intra_filter_switchable(p_angle)) + this_rate += + cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; +#endif // CONFIG_INTRA_INTERP + this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } +#endif // CONFIG_EXT_INTRA + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); +#if CONFIG_FILTER_INTRA + if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) { + filter_intra_mode_skip_mask ^= (1 << mbmi->mode); + } +#endif // CONFIG_FILTER_INTRA + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; +#if CONFIG_FILTER_INTRA + beat_best_rd = 1; +#endif // CONFIG_FILTER_INTRA + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + *skippable = s; +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + } + } + +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + +#if CONFIG_CFL + // Perform one extra txfm_rd_in_plane() call, this time with the best value so + // we can store reconstructed luma values + RD_STATS this_rd_stats; + x->cfl_store_y = 1; + txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize, + mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing); + x->cfl_store_y = 0; +#endif + +#if CONFIG_PALETTE + if (try_palette) { + rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, + bmode_costs[DC_PRED], &best_mbmi, + best_palette_color_map, &best_rd, &best_model_rd, + rate, rate_tokenonly, distortion, skippable); + } +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA + if (beat_best_rd) { + if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, + skippable, bsize, bmode_costs[DC_PRED], + &best_rd, &best_model_rd, + filter_intra_mode_skip_mask)) { + best_mbmi = *mbmi; + } + } +#endif // CONFIG_FILTER_INTRA + + *mbmi = best_mbmi; + return best_rd; +} + +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); + int plane; + int is_cost_valid = 1; + av1_init_rd_stats(rd_stats); + + if (ref_best_rd < 0) is_cost_valid = 0; + +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + if (x->skip_chroma_rd) return is_cost_valid; + + bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); +#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + +#if !CONFIG_PVQ + if (is_inter_block(mbmi) && is_cost_valid) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, bsize, plane); + } +#endif // !CONFIG_PVQ + + if (is_cost_valid) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + RD_STATS pn_rd_stats; + txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize, + uv_tx_size, cpi->sf.use_fast_coef_costing); + if (pn_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) > + ref_best_rd && + RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} + +#if CONFIG_VAR_TX +// FIXME crop these calls +static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride, + TX_SIZE tx_size) { + return aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size], + tx_size_high[tx_size]); +} + +void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, + int blk_row, int blk_col, int plane, int block, + int plane_bsize, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int64_t tmp; + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size]; + int bh = block_size_high[txm_bsize]; + int bw = block_size_wide[txm_bsize]; + int txb_h = tx_size_high_unit[tx_size]; + int txb_w = tx_size_wide_unit[tx_size]; + + int src_stride = p->src.stride; + uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + uint8_t *dst = + &pd->dst + .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]]; +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]); + uint8_t *rec_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + int max_blocks_high = block_size_high[plane_bsize]; + int max_blocks_wide = block_size_wide[plane_bsize]; + const int diff_stride = max_blocks_wide; + const int16_t *diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + int txb_coeff_cost; + + assert(tx_size < TX_SIZES_ALL); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + if (xd->mb_to_right_edge < 0) + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + + max_blocks_high >>= tx_size_wide_log2[0]; + max_blocks_wide >>= tx_size_wide_log2[0]; + + int coeff_ctx = get_entropy_context(tx_size, a, l); + + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + + av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + +// TODO(any): Use av1_dist_block to compute distortion +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16); + aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, + 0, NULL, 0, bw, bh, xd->bd); + } else { + rec_buffer = (uint8_t *)rec_buffer16; + aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, + NULL, 0, bw, bh); + } +#else + aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL, + 0, bw, bh); +#endif // CONFIG_HIGHBITDEPTH + + if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) { + int idx, idy; + int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); + int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); + tmp = 0; + for (idy = 0; idy < blocks_height; ++idy) { + for (idx = 0; idx < blocks_width; ++idx) { + const int16_t *d = + diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]); + tmp += sum_squares_2d(d, diff_stride, 0); + } + } + } else { + tmp = sum_squares_2d(diff, diff_stride, tx_size); + } + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); +#endif // CONFIG_HIGHBITDEPTH + rd_stats->sse += tmp * 16; + const int eob = p->eobs[block]; + + av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer, + MAX_TX_SIZE, eob); + if (eob > 0) { + if (txb_w + blk_col > max_blocks_wide || + txb_h + blk_row > max_blocks_high) { + int idx, idy; + unsigned int this_dist; + int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); + int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); + tmp = 0; + for (idy = 0; idy < blocks_height; ++idy) { + for (idx = 0; idx < blocks_width; ++idx) { + uint8_t *const s = + src + ((idy * src_stride + idx) << tx_size_wide_log2[0]); + uint8_t *const r = + rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]); + cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist); + tmp += this_dist; + } + } + } else { + uint32_t this_dist; + cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE, + &this_dist); + tmp = this_dist; + } + } + rd_stats->dist += tmp * 16; + txb_coeff_cost = + av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0); + rd_stats->rate += txb_coeff_cost; + rd_stats->skip &= (eob == 0); + +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, + txb_coeff_cost); +#endif // CONFIG_RD_DEBUG +} + +static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int plane, int block, int block32, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t ref_best_rd, + int *is_cost_valid, RD_STATS *rd_stats_stack) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + TX_SIZE(*const inter_tx_size) + [MAX_MIB_SIZE] = + (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col]; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + int64_t this_rd = INT64_MAX; + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + int coeff_ctx, i; + int ctx = + txfm_partition_context(tx_above + (blk_col >> 1), + tx_left + (blk_row >> 1), mbmi->sb_type, tx_size); + int64_t sum_rd = INT64_MAX; + int tmp_eob = 0; + int zero_blk_rate; + RD_STATS sum_rd_stats; + const int tx_size_ctx = txsize_sqr_map[tx_size]; + + av1_init_rd_stats(&sum_rd_stats); + + assert(tx_size < TX_SIZES_ALL); + + if (ref_best_rd < 0) { + *is_cost_valid = 0; + return; + } + + coeff_ctx = get_entropy_context(tx_size, pta, ptl); + + av1_init_rd_stats(rd_stats); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] + [coeff_ctx][EOB_TOKEN]; + + if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { + inter_tx_size[0][0] = tx_size; + + if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT && + rd_stats_stack[block32].rate != INT_MAX) { + *rd_stats = rd_stats_stack[block32]; + p->eobs[block] = !rd_stats->skip; + x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; + } else { + av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, + plane_bsize, pta, ptl, rd_stats); + if (tx_size == TX_32X32) { + rd_stats_stack[block32] = *rd_stats; + } + } + + if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, + zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + x->blk_skip[plane][blk_row * bw + blk_col] = 1; + p->eobs[block] = 0; + } else { + x->blk_skip[plane][blk_row * bw + blk_col] = 0; + rd_stats->skip = 0; + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += + av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); + this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); + tmp_eob = p->eobs[block]; + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + RD_STATS this_rd_stats; + int this_cost_valid = 1; + int64_t tmp_rd = 0; + + sum_rd_stats.rate = + av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); + + assert(tx_size < TX_SIZES_ALL); + + for (i = 0; i < 4 && this_cost_valid; ++i) { + int offsetr = blk_row + (i >> 1) * bsl; + int offsetc = blk_col + (i & 0x01) * bsl; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs, + depth + 1, plane_bsize, ta, tl, tx_above, tx_left, + &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid, + rd_stats_stack); + + av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); + + tmp_rd = + RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist); + if (this_rd < tmp_rd) break; + block += sub_step; + } + if (this_cost_valid) sum_rd = tmp_rd; + } + + if (this_rd < sum_rd) { + int idx, idy; + for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0); + for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0); + txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1), + tx_size, tx_size); + inter_tx_size[0][0] = tx_size; + for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) + for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) + inter_tx_size[idy][idx] = tx_size; + mbmi->tx_size = tx_size; + if (this_rd == INT64_MAX) *is_cost_valid = 0; + x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; + } else { + *rd_stats = sum_rd_stats; + if (sum_rd == INT64_MAX) *is_cost_valid = 0; + } +} + +static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, RD_STATS *rd_stats_stack) { + MACROBLOCKD *const xd = &x->e_mbd; + int is_cost_valid = 1; + int64_t this_rd = 0; + + if (ref_best_rd < 0) is_cost_valid = 0; + + av1_init_rd_stats(rd_stats); + + if (is_cost_valid) { + const struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + int idx, idy; + int block = 0; + int block32 = 0; + int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); + + av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, + sizeof(TXFM_CONTEXT) * (mi_width >> 1)); + memcpy(tx_left, xd->left_txfm_context, + sizeof(TXFM_CONTEXT) * (mi_height >> 1)); + + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + select_tx_block(cpi, x, idy, idx, 0, block, block32, max_tx_size, + mi_height != mi_width, plane_bsize, ctxa, ctxl, + tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd, + &is_cost_valid, rd_stats_stack); + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += AOMMIN( + RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse)); + block += step; + ++block32; + } + } + } + + this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + if (this_rd > ref_best_rd) is_cost_valid = 0; + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } +} + +static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, TX_TYPE tx_type, + RD_STATS *rd_stats_stack) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int is_inter = is_inter_block(mbmi); + aom_prob skip_prob = av1_get_skip_prob(cm, xd); + int s0 = av1_cost_bit(skip_prob, 0); + int s1 = av1_cost_bit(skip_prob, 1); + int64_t rd; + int row, col; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + + mbmi->tx_type = tx_type; + mbmi->min_tx_size = TX_SIZES_ALL; + inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack); + + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + for (row = 0; row < max_blocks_high / 2; ++row) + for (col = 0; col < max_blocks_wide / 2; ++col) + mbmi->min_tx_size = AOMMIN( + mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col])); + +#if CONFIG_EXT_TX + if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter, + cm->reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->mbmi.segment_id]) { + const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter, + cm->reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + rd_stats->rate += + cpi->inter_tx_type_costs[ext_tx_set] + [txsize_sqr_map[mbmi->min_tx_size]] + [mbmi->tx_type]; + } else { + if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) + rd_stats->rate += + cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode] + [mbmi->tx_type]; + } + } +#else // CONFIG_EXT_TX + if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) + rd_stats->rate += + cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; +#endif // CONFIG_EXT_TX + + if (rd_stats->skip) + rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + else + rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist); + + if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && + !(rd_stats->skip)) + rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + + return rd; +} + +static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + const AV1_COMMON *cm = &cpi->common; + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t rd = INT64_MAX; + int64_t best_rd = INT64_MAX; + TX_TYPE tx_type, best_tx_type = DCT_DCT; + const int is_inter = is_inter_block(mbmi); + TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE]; + TX_SIZE best_tx = max_txsize_lookup[bsize]; + TX_SIZE best_min_tx_size = TX_SIZES_ALL; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; + const int n4 = bsize_to_num_blk(bsize); + int idx, idy; + int prune = 0; + const int count32 = + 1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32])); +#if CONFIG_EXT_PARTITION + RD_STATS rd_stats_stack[16]; +#else + RD_STATS rd_stats_stack[4]; +#endif // CONFIG_EXT_PARTITION +#if CONFIG_EXT_TX + const int ext_tx_set = + get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used); +#endif // CONFIG_EXT_TX + + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) +#if CONFIG_EXT_TX + prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set); +#else + prune = prune_tx_types(cpi, bsize, x, xd, 0); +#endif // CONFIG_EXT_TX + + av1_invalid_rd_stats(rd_stats); + + for (idx = 0; idx < count32; ++idx) + av1_invalid_rd_stats(&rd_stats_stack[idx]); + + for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); +#if CONFIG_EXT_TX + if (is_inter) { + if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; + if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { + if (!do_tx_type_search(tx_type, prune)) continue; + } + } else { + if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) { + if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; + } + if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue; + } +#else // CONFIG_EXT_TX + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && + !do_tx_type_search(tx_type, prune)) + continue; +#endif // CONFIG_EXT_TX + if (is_inter && x->use_default_inter_tx_type && + tx_type != get_default_tx_type(0, xd, 0, max_tx_size)) + continue; + + if (xd->lossless[mbmi->segment_id]) + if (tx_type != DCT_DCT) continue; + + rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, + tx_type, rd_stats_stack); + + if (rd < best_rd) { + best_rd = rd; + *rd_stats = this_rd_stats; + best_tx_type = mbmi->tx_type; + best_tx = mbmi->tx_size; + best_min_tx_size = mbmi->min_tx_size; + memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; + } + } + + mbmi->tx_type = best_tx_type; + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx]; + mbmi->tx_size = best_tx; + mbmi->min_tx_size = best_min_tx_size; + memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4); +} + +static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, + int blk_col, int plane, int block, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx, + ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + TX_SIZE plane_tx_size; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + assert(tx_size < TX_SIZES_ALL); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + plane_tx_size = + plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] + : mbmi->inter_tx_size[tx_row][tx_col]; + + if (tx_size == plane_tx_size) { + int i; + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, + plane_bsize, ta, tl, rd_stats); + + for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) + ta[i] = !(p->eobs[block] == 0); + for (i = 0; i < tx_size_high_unit[tx_size]; ++i) + tl[i] = !(p->eobs[block] == 0); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + int i; + + assert(bsl > 0); + + for (i = 0; i < 4; ++i) { + int offsetr = blk_row + (i >> 1) * bsl; + int offsetc = blk_col + (i & 0x01) * bsl; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize, + above_ctx, left_ctx, rd_stats); + block += step; + } + } +} + +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int plane; + int is_cost_valid = 1; + int64_t this_rd; + + if (ref_best_rd < 0) is_cost_valid = 0; + + av1_init_rd_stats(rd_stats); + +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + if (x->skip_chroma_rd) return is_cost_valid; + bsize = AOMMAX(BLOCK_8X8, bsize); +#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + if (is_rect_tx(mbmi->tx_size)) { + return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd); + } +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + if (is_inter_block(mbmi) && is_cost_valid) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, bsize, plane); + } + + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0]; + const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + int idx, idy; + int block = 0; + const int step = bh * bw; + ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE]; + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); + + av1_get_entropy_contexts(bsize, 0, pd, ta, tl); + + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize, + ta, tl, &pn_rd_stats); + block += step; + } + } + + if (pn_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + + this_rd = + AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + + if (this_rd > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} +#endif // CONFIG_VAR_TX + +#if CONFIG_PALETTE +static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, + int dc_mode_cost, + uint8_t *best_palette_color_map, + MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->sb_type; + int this_rate; + int64_t this_rd; + int colors_u, colors_v, colors; + const int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + uint8_t *const color_map = xd->plane[1].color_index_map; + RD_STATS tokenonly_rd_stats; + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return; + + mbmi->uv_mode = DC_PRED; +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) { + colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols, + cpi->common.bit_depth); + colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols, + cpi->common.bit_depth); + } else { +#endif // CONFIG_HIGHBITDEPTH + colors_u = av1_count_colors(src_u, src_stride, rows, cols); + colors_v = av1_count_colors(src_v, src_stride, rows, cols); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + colors = colors_u > colors_v ? colors_u : colors_v; + if (colors > 1 && colors <= 64) { + int r, c, n, i, j; + const int max_itr = 50; + uint8_t color_order[PALETTE_MAX_SIZE]; + float lb_u, ub_u, val_u; + float lb_v, ub_v, val_v; + float *const data = x->palette_buffer->kmeans_data_buf; + float centroids[2 * PALETTE_MAX_SIZE]; + +#if CONFIG_HIGHBITDEPTH + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); + uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); + if (cpi->common.use_highbitdepth) { + lb_u = src_u16[0]; + ub_u = src_u16[0]; + lb_v = src_v16[0]; + ub_v = src_v16[0]; + } else { +#endif // CONFIG_HIGHBITDEPTH + lb_u = src_u[0]; + ub_u = src_u[0]; + lb_v = src_v[0]; + ub_v = src_v[0]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) { + val_u = src_u16[r * src_stride + c]; + val_v = src_v16[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } else { +#endif // CONFIG_HIGHBITDEPTH + val_u = src_u[r * src_stride + c]; + val_v = src_v[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + if (val_u < lb_u) + lb_u = val_u; + else if (val_u > ub_u) + ub_u = val_u; + if (val_v < lb_v) + lb_v = val_v; + else if (val_v > ub_v) + ub_v = val_v; + } + } + + for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; + --n) { + for (i = 0; i < n; ++i) { + centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; + centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); +#if CONFIG_PALETTE_DELTA_ENCODING + // Sort the U channel colors in ascending order. + for (i = 0; i < 2 * (n - 1); i += 2) { + int min_idx = i; + float min_val = centroids[i]; + for (j = i + 2; j < 2 * n; j += 2) + if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; + if (min_idx != i) { + float temp_u = centroids[i], temp_v = centroids[i + 1]; + centroids[i] = centroids[min_idx]; + centroids[i + 1] = centroids[min_idx + 1]; + centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; + } + } + av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); +#endif // CONFIG_PALETTE_DELTA_ENCODING + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); + pmi->palette_size[1] = n; + for (i = 1; i < 3; ++i) { + for (j = 0; j < n; ++j) { +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( + (int)centroids[j * 2 + i - 1], cpi->common.bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = + clip_pixel((int)centroids[j * 2 + i - 1]); + } + } + + super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = + tokenonly_rd_stats.rate + dc_mode_cost + + cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] + + write_uniform_cost(n, color_map[0]) + + av1_cost_bit( + av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1); + this_rate += av1_palette_color_cost_uv(pmi, cpi->common.bit_depth); + for (i = 0; i < rows; ++i) { + for (j = (i == 0 ? 1 : 0); j < cols; ++j) { + int color_idx; + const int color_ctx = av1_get_palette_color_index_context( + color_map, plane_block_width, i, j, n, color_order, &color_idx); + assert(color_idx >= 0 && color_idx < n); + this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE] + [color_ctx][color_idx]; + } + } + + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_mbmi = *mbmi; + memcpy(best_palette_color_map, color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + *rate = this_rate; + *distortion = tokenonly_rd_stats.dist; + *rate_tokenonly = tokenonly_rd_stats.rate; + *skippable = tokenonly_rd_stats.skip; + } + } + } + if (best_mbmi->palette_mode_info.palette_size[1] > 0) { + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + } +} +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA +// Return 1 if an filter intra mode is selected; return 0 otherwise. +static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, int64_t *best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int filter_intra_selected_flag = 0; + int this_rate; + int64_t this_rd; + FILTER_INTRA_MODE mode; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + RD_STATS tokenonly_rd_stats; + + av1_zero(filter_intra_mode_info); + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1; + mbmi->uv_mode = DC_PRED; +#if CONFIG_PALETTE + mbmi->palette_mode_info.palette_size[1] = 0; +#endif // CONFIG_PALETTE + + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { + mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode; + if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd)) + continue; + + this_rate = tokenonly_rd_stats.rate + + av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) + + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + + write_uniform_cost(FILTER_INTRA_MODES, mode); + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip; + filter_intra_mode_info = mbmi->filter_intra_mode_info; + filter_intra_selected_flag = 1; + } + } + + if (filter_intra_selected_flag) { + mbmi->uv_mode = DC_PRED; + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = + filter_intra_mode_info.use_filter_intra_mode[1]; + mbmi->filter_intra_mode_info.filter_intra_mode[1] = + filter_intra_mode_info.filter_intra_mode[1]; + return 1; + } else { + return 0; + } +} +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_EXT_INTRA +// Run RD calculation with given chroma intra prediction angle., and return +// the RD cost. Update the best mode info. if the RD cost is the best so far. +static int64_t pick_intra_angle_routine_sbuv( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats, + int *best_angle_delta, int64_t *best_rd) { + MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + int this_rate; + int64_t this_rd; + RD_STATS tokenonly_rd_stats; + + if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) + return INT64_MAX; + this_rate = tokenonly_rd_stats.rate + rate_overhead; + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_angle_delta = mbmi->angle_delta[1]; + *rate = this_rate; + rd_stats->rate = tokenonly_rd_stats.rate; + rd_stats->dist = tokenonly_rd_stats.dist; + rd_stats->skip = tokenonly_rd_stats.skip; + } + return this_rd; +} + +// With given chroma directional intra prediction mode, pick the best angle +// delta. Return true if a RD cost that is smaller than the input one is found. +static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int rate_overhead, + int64_t best_rd, int *rate, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int i, angle_delta, best_angle_delta = 0; + int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + + rd_stats->rate = INT_MAX; + rd_stats->skip = 0; + rd_stats->dist = INT64_MAX; + for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + + for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (i = 0; i < 2; ++i) { + best_rd_in = (best_rd == INT64_MAX) + ? INT64_MAX + : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5))); + mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta; + this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, + best_rd_in, rate, rd_stats, + &best_angle_delta, &best_rd); + rd_cost[2 * angle_delta + i] = this_rd; + if (angle_delta == 0) { + if (this_rd == INT64_MAX) return 0; + rd_cost[1] = this_rd; + break; + } + } + } + + assert(best_rd != INT64_MAX); + for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + int64_t rd_thresh; + for (i = 0; i < 2; ++i) { + int skip_search = 0; + rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta; + pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd, + rate, rd_stats, &best_angle_delta, + &best_rd); + } + } + } + + mbmi->angle_delta[1] = best_angle_delta; + return rd_stats->rate != INT_MAX; +} +#endif // CONFIG_EXT_INTRA + +static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO best_mbmi = *mbmi; + PREDICTION_MODE mode; + int64_t best_rd = INT64_MAX, this_rd; + int this_rate; + RD_STATS tokenonly_rd_stats; +#if CONFIG_PVQ + od_rollback_buffer buf; + od_encode_checkpoint(&x->daala_enc, &buf); +#endif // CONFIG_PVQ +#if CONFIG_PALETTE + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + uint8_t *best_palette_color_map = NULL; +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA +#if CONFIG_PALETTE + pmi->palette_size[1] = 0; +#endif // CONFIG_PALETTE + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { +#if CONFIG_EXT_INTRA + const int is_directional_mode = + av1_is_directional_mode(mode, mbmi->sb_type); +#endif // CONFIG_EXT_INTRA + if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & + (1 << mode))) + continue; + + mbmi->uv_mode = mode; +#if CONFIG_EXT_INTRA + mbmi->angle_delta[1] = 0; + if (is_directional_mode) { + const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] + + write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0); + if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, + &this_rate, &tokenonly_rd_stats)) + continue; + } else { +#endif // CONFIG_EXT_INTRA + if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &buf); +#endif // CONFIG_PVQ + continue; + } +#if CONFIG_EXT_INTRA + } +#endif // CONFIG_EXT_INTRA + this_rate = + tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode]; + +#if CONFIG_EXT_INTRA + if (is_directional_mode) { + this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[1]); + } +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED) + this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0); +#endif // CONFIG_FILTER_INTRA +#if CONFIG_PALETTE + if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 && + mode == DC_PRED) + this_rate += av1_cost_bit( + av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); +#endif // CONFIG_PALETTE + +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &buf); +#endif // CONFIG_PVQ + this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip; + } + } + +#if CONFIG_PALETTE + if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) { + best_palette_color_map = x->palette_buffer->best_palette_color_map; + rd_pick_palette_intra_sbuv(cpi, x, + cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, + rate, rate_tokenonly, distortion, skippable); + } +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA + if (mbmi->sb_type >= BLOCK_8X8) { + if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion, + skippable, bsize, &best_rd)) + best_mbmi = *mbmi; + } +#endif // CONFIG_FILTER_INTRA + + *mbmi = best_mbmi; + // Make sure we actually chose a mode + assert(best_rd < INT64_MAX); + return best_rd; +} + +static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, + TX_SIZE max_tx_size, int *rate_uv, + int *rate_uv_tokenonly, int64_t *dist_uv, + int *skip_uv, PREDICTION_MODE *mode_uv) { + // Use an estimated rd for uv_intra based on DC_PRED if the + // appropriate speed flag is set. + (void)ctx; +#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 + rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + bsize, max_tx_size); +#else + max_tx_size = AOMMAX(max_tx_size, TX_4X4); + if (x->skip_chroma_rd) { + *rate_uv = 0; + *rate_uv_tokenonly = 0; + *dist_uv = 0; + *skip_uv = 1; + *mode_uv = DC_PRED; + return; + } + BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x, + x->e_mbd.plane[1].subsampling_y); + rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + bs, max_tx_size); +#endif // CONFIG_CHROMA_2X2 +#else + rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size); +#endif // CONFIG_CB4X4 + *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode; +} + +static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, + int16_t mode_context) { +#if CONFIG_EXT_INTER + if (is_inter_compound_mode(mode)) { + return cpi + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } +#endif + +#if CONFIG_REF_MV + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET); + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = cpi->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = cpi->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; + + if (is_all_zero_mv) return mode_cost; + + if (mode == ZEROMV) { + mode_cost += cpi->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += cpi->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; + if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; + if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; + + mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +#else + assert(is_inter_mode(mode)); + return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; +#endif // CONFIG_REF_MV +} + +#if CONFIG_EXT_INTER +static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, + COMPOUND_TYPE comp_type) { + (void)bsize; + switch (comp_type) { + case COMPOUND_AVERAGE: return 0; +#if CONFIG_WEDGE + case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize); +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + case COMPOUND_SEG: return 1; +#endif // CONFIG_COMPOUND_SEGMENT + default: assert(0); return 0; + } +} +#endif // CONFIG_EXT_INTER + +static int set_and_cost_bmi_mvs( + const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, int i, + PREDICTION_MODE mode, int_mv this_mv[2], + int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], + int_mv seg_mvs[TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER + int_mv compound_seg_newmvs[2], +#endif // CONFIG_EXT_INTER + int_mv *best_ref_mv[2], const int *mvjcost, int *mvcost[2], int mi_row, + int mi_col) { + MODE_INFO *const mic = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mic->mbmi; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + int thismvcost = 0; + int idx, idy; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; + const int is_compound = has_second_ref(mbmi); + int mode_ctx; + (void)mi_row; + (void)mi_col; + + switch (mode) { + case NEWMV: this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; +#if CONFIG_EXT_INTER + if (!cpi->common.allow_high_precision_mv) + lower_mv_precision(&this_mv[0].as_mv, 0); +#endif // CONFIG_EXT_INTER + +#if CONFIG_REF_MV + for (idx = 0; idx < 1 + is_compound; ++idx) { + this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]]; + av1_set_mvcost(x, mbmi->ref_frame[idx], idx, mbmi->ref_mv_idx); + thismvcost += + av1_mv_bit_cost(&this_mv[idx].as_mv, &best_ref_mv[idx]->as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB); + } + (void)mvjcost; + (void)mvcost; +#else + thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); +#if !CONFIG_EXT_INTER + if (is_compound) { + this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; + thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + } +#endif // !CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + break; + case NEARMV: + case NEARESTMV: + this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; + if (is_compound) + this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; + break; + case ZEROMV: { + int ref; + for (ref = 0; ref < 1 + is_compound; ++ref) { +#if CONFIG_GLOBAL_MOTION + this_mv[ref].as_int = + gm_get_motion_vector( + &cpi->common.global_motion[mbmi->ref_frame[ref]], + cpi->common.allow_high_precision_mv, mbmi->sb_type, mi_col, + mi_row, i) + .as_int; +#else + this_mv[ref].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + } + break; + } +#if CONFIG_EXT_INTER + case NEW_NEWMV: + if (compound_seg_newmvs[0].as_int == INVALID_MV || + compound_seg_newmvs[1].as_int == INVALID_MV) { + this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; + this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; + } else { + this_mv[0].as_int = compound_seg_newmvs[0].as_int; + this_mv[1].as_int = compound_seg_newmvs[1].as_int; + } + if (!cpi->common.allow_high_precision_mv) + lower_mv_precision(&this_mv[0].as_mv, 0); + if (!cpi->common.allow_high_precision_mv) + lower_mv_precision(&this_mv[1].as_mv, 0); +#if CONFIG_REF_MV + av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); +#endif + thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); +#if CONFIG_REF_MV + av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx); +#endif + thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + break; + case NEW_NEARMV: + case NEW_NEARESTMV: + this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; + if (!cpi->common.allow_high_precision_mv) + lower_mv_precision(&this_mv[0].as_mv, 0); +#if CONFIG_REF_MV + av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); +#endif + thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; + break; + case NEAR_NEWMV: + case NEAREST_NEWMV: + this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; + this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; + if (!cpi->common.allow_high_precision_mv) + lower_mv_precision(&this_mv[1].as_mv, 0); +#if CONFIG_REF_MV + av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx); +#endif + thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + break; + case NEAREST_NEARMV: + case NEAR_NEARESTMV: + case NEAREST_NEARESTMV: + case NEAR_NEARMV: + this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; + this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; + break; + case ZERO_ZEROMV: +#if CONFIG_GLOBAL_MOTION + this_mv[0].as_int = + gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[0]], + cpi->common.allow_high_precision_mv, + mbmi->sb_type, mi_col, mi_row, i) + .as_int; + this_mv[1].as_int = + gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[1]], + cpi->common.allow_high_precision_mv, + mbmi->sb_type, mi_col, mi_row, i) + .as_int; +#else + this_mv[0].as_int = 0; + this_mv[1].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + break; +#endif // CONFIG_EXT_INTER + default: break; + } + + mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int; + if (is_compound) mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int; + + mic->bmi[i].as_mode = mode; + +#if CONFIG_REF_MV + if (mode == NEWMV) { + mic->bmi[i].pred_mv[0].as_int = + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_int; + if (is_compound) + mic->bmi[i].pred_mv[1].as_int = + mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_int; + } else { + mic->bmi[i].pred_mv[0].as_int = this_mv[0].as_int; + if (is_compound) mic->bmi[i].pred_mv[1].as_int = this_mv[1].as_int; + } +#endif // CONFIG_REF_MV + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) + memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); + +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (is_compound) + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + else +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, mbmi->sb_type, i); +#else // CONFIG_REF_MV + mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; +#endif // CONFIG_REF_MV + return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost; +} + +static int64_t encode_inter_mb_segment_sub8x8( + const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_yrd, int i, + int *labelyrate, int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, int ir, int ic, int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + MODE_INFO *const mi = xd->mi[0]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd); + const int txb_width = max_block_wide(xd, plane_bsize, 0); + const int txb_height = max_block_high(xd, plane_bsize, 0); + const int width = block_size_wide[plane_bsize]; + const int height = block_size_high[plane_bsize]; + int idx, idy; + const uint8_t *const src = + &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + uint8_t *const dst = + &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; + int64_t thisdistortion = 0, thissse = 0; + int thisrate = 0; + TX_SIZE tx_size = mi->mbmi.tx_size; + TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size); + const int num_4x4_w = tx_size_wide_unit[tx_size]; + const int num_4x4_h = tx_size_high_unit[tx_size]; +#if !CONFIG_PVQ + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1); +#else + (void)cpi; + (void)ta; + (void)tl; + (void)tx_type; +#endif // !CONFIG_PVQ + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4)); + assert(IMPLIES(!xd->lossless[mi->mbmi.segment_id], + tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type])); +#else + assert(tx_size == TX_4X4); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + assert(tx_type == DCT_DCT); + + av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col); + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block( + height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); + } else { + aom_subtract_block(height, width, + av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); + } +#else + aom_subtract_block(height, width, + av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); +#endif // CONFIG_HIGHBITDEPTH + + for (idy = 0; idy < txb_height; idy += num_4x4_h) { + for (idx = 0; idx < txb_width; idx += num_4x4_w) { + int64_t dist, ssz, rd, rd1, rd2; + int coeff_ctx; + const int k = i + (idy * 2 + idx); + const int block = av1_raster_order_to_block_index(tx_size, k); + assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, + idx == 0 && idy == 0)); + coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1))); + av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01), + BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0) + av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); + av1_dist_block(cpi, x, 0, BLOCK_8X8, block, idy + (i >> 1), + idx + (i & 0x1), tx_size, &dist, &ssz, + OUTPUT_HAS_PREDICTED_PIXELS); + thisdistortion += dist; + thissse += ssz; +#if !CONFIG_PVQ + thisrate += + av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, (ta + (k & 1)), + (tl + (k >> 1)), cpi->sf.use_fast_coef_costing); + *(ta + (k & 1)) = !(p->eobs[block] == 0); + *(tl + (k >> 1)) = !(p->eobs[block] == 0); +#else + thisrate += x->rate; +#endif // !CONFIG_PVQ +#if CONFIG_EXT_TX + if (tx_size == TX_8X4) { + *(ta + (k & 1) + 1) = *(ta + (k & 1)); + } + if (tx_size == TX_4X8) { + *(tl + (k >> 1) + 1) = *(tl + (k >> 1)); + } +#endif // CONFIG_EXT_TX + rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse); + rd = AOMMIN(rd1, rd2); + if (rd >= best_yrd) return INT64_MAX; + } + } + + *distortion = thisdistortion; + *labelyrate = thisrate; + *sse = thissse; + + return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); +} + +typedef struct { + int eobs; + int brate; + int byrate; + int64_t bdist; + int64_t bsse; + int64_t brdcost; + int_mv mvs[2]; +#if CONFIG_REF_MV + int_mv pred_mv[2]; +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + int_mv ref_mv[2]; +#endif // CONFIG_EXT_INTER + +#if CONFIG_CB4X4 + ENTROPY_CONTEXT ta[4]; + ENTROPY_CONTEXT tl[4]; +#else + ENTROPY_CONTEXT ta[2]; + ENTROPY_CONTEXT tl[2]; +#endif // CONFIG_CB4X4 +} SEG_RDSTAT; + +typedef struct { + int_mv *ref_mv[2]; + int_mv mvp; + + int64_t segment_rd; + int r; + int64_t d; + int64_t sse; + int segment_yrate; + PREDICTION_MODE modes[4]; +#if CONFIG_EXT_INTER + SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; +#else + SEG_RDSTAT rdstat[4][INTER_MODES]; +#endif // CONFIG_EXT_INTER + int mvthresh; +} BEST_SEG_INFO; + +static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { + return (mv->row >> 3) < mv_limits->row_min || + (mv->row >> 3) > mv_limits->row_max || + (mv->col >> 3) < mv_limits->col_min || + (mv->col >> 3) > mv_limits->col_max; +} + +static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { + MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + + p->src.buf = + &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); + pd->pre[0].buf = + &pd->pre[0].buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)]; + if (has_second_ref(mbmi)) + pd->pre[1].buf = + &pd->pre[1] + .buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[1].stride)]; +} + +static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, + struct buf_2d orig_pre[2]) { + MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + x->plane[0].src = orig_src; + x->e_mbd.plane[0].pre[0] = orig_pre[0]; + if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1]; +} + +// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion. +// TODO(aconverse): Find out if this is still productive then clean up or remove +static int check_best_zero_mv( + const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME], +#if CONFIG_REF_MV && CONFIG_EXT_INTER + const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME], +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, + const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, + int mi_row, int mi_col) { + int_mv zeromv[2]; + int comp_pred_mode = ref_frames[1] > INTRA_FRAME; + int cur_frm; + (void)mi_row; + (void)mi_col; + for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { +#if CONFIG_GLOBAL_MOTION + if (this_mode == ZEROMV +#if CONFIG_EXT_INTER + || this_mode == ZERO_ZEROMV +#endif // CONFIG_EXT_INTER + ) + zeromv[cur_frm].as_int = + gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]], + cpi->common.allow_high_precision_mv, bsize, + mi_col, mi_row, block) + .as_int; + else +#endif // CONFIG_GLOBAL_MOTION + zeromv[cur_frm].as_int = 0; + } +#if !CONFIG_EXT_INTER + assert(ref_frames[1] != INTRA_FRAME); // Just sanity check +#endif // !CONFIG_EXT_INTER + if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && + frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && + (ref_frames[1] <= INTRA_FRAME || + frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) { +#if CONFIG_REF_MV + int16_t rfc = + av1_mode_context_analyzer(mode_context, ref_frames, bsize, block); +#else + int16_t rfc = mode_context[ref_frames[0]]; +#endif // CONFIG_REF_MV + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + +#if !CONFIG_REF_MV + (void)bsize; + (void)block; +#endif // !CONFIG_REF_MV + + if (this_mode == NEARMV) { + if (c1 > c3) return 0; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) return 0; + } else { + assert(this_mode == ZEROMV); + if (ref_frames[1] <= INTRA_FRAME) { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) + return 0; + } else { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARMV][ref_frames[1]].as_int == 0)) + return 0; + } + } + } +#if CONFIG_EXT_INTER + else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV || + this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV || + this_mode == ZERO_ZEROMV) && + frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && + frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) { +#if CONFIG_REF_MV + int16_t rfc = compound_mode_context[ref_frames[0]]; +#else + int16_t rfc = mode_context[ref_frames[0]]; +#endif // CONFIG_REF_MV + int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc); + int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc); + int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc); + + if (this_mode == NEAREST_NEARMV) { + if (c1 > c3) return 0; + } else if (this_mode == NEAREST_NEARESTMV) { + if (c2 > c3) return 0; + } else if (this_mode == NEAR_NEARESTMV) { + if (c4 > c3) return 0; + } else if (this_mode == NEAR_NEARMV) { + if (c5 > c3) return 0; + } else { + assert(this_mode == ZERO_ZEROMV); + if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) || + (c3 >= c1 && frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) || + (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) || + (c3 >= c4 && frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0)) + return 0; + } + } +#endif // CONFIG_EXT_INTER + return 1; +} + +static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, + int mi_col, +#if CONFIG_EXT_INTER + int_mv *ref_mv_sub8x8[2], +#endif // CONFIG_EXT_INTER + int *rate_mv, const int block) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; + int_mv ref_mv[2]; + int ite, ref; +#if CONFIG_DUAL_FILTER + InterpFilter interp_filter[4] = { + mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2], + mbmi->interp_filter[3], + }; +#else + const InterpFilter interp_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + struct scale_factors sf; + struct macroblockd_plane *const pd = &xd->plane[0]; +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" + const int ic = block & 1; + const int ir = (block - ic) >> 1; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; +#if CONFIG_GLOBAL_MOTION + int is_global[2]; + for (ref = 0; ref < 2; ++ref) { + WarpedMotionParams *const wm = + &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]]; + is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype); + } +#endif // CONFIG_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + +// Prediction buffer from second frame. +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; +#else + DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_EXT_INTER && CONFIG_CB4X4 + (void)ref_mv_sub8x8; +#endif // CONFIG_EXT_INTER && CONFIG_CB4X4 + + for (ref = 0; ref < 2; ++ref) { +#if CONFIG_EXT_INTER && !CONFIG_CB4X4 + if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL) + ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int; + else +#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 + ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; + + if (scaled_ref_frame[ref]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL); + } + } + +// Since we have scaled the reference frames to match the size of the current +// frame we must use a unit scaling factor during mode selection. +#if CONFIG_HIGHBITDEPTH + av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height, cm->use_highbitdepth); +#else + av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height); +#endif // CONFIG_HIGHBITDEPTH + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < 4; ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; + MV *const best_mv = &x->best_mv.as_mv; + int search_range = 3; + + MvLimits tmp_mv_limits = x->mv_limits; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + const int plane = 0; + ConvolveParams conv_params = get_conv_params(0, plane); +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + WarpTypesAllowed warp_types; +#if CONFIG_GLOBAL_MOTION + warp_types.global_warp_allowed = is_global[!id]; +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; +#endif // CONFIG_WARPED_MOTION +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + + // Initialized here because of compiler problem in Visual Studio. + ref_yv12[0] = xd->plane[plane].pre[0]; + ref_yv12[1] = xd->plane[plane].pre[1]; + +#if CONFIG_DUAL_FILTER + // reload the filter types + interp_filter[0] = + (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0]; + interp_filter[1] = + (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1]; +#endif // CONFIG_DUAL_FILTER + +// Get the prediction block from the 'other' reference frame. +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + av1_highbd_build_inter_predictor( + ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, + &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, p_col, p_row, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); + } else { + second_pred = (uint8_t *)second_pred_alloc_16; +#endif // CONFIG_HIGHBITDEPTH + av1_build_inter_predictor( + ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, + &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, p_col, p_row, plane, !id, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + // Do compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); + + // Use the mv result from the single mode as mv predictor. + *best_mv = frame_mv[refs[id]].as_mv; + + best_mv->col >>= 3; + best_mv->row >>= 3; + +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + + // Small-range full-pixel motion search. + bestsme = + av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize], + &ref_mv[id].as_mv, second_pred); + if (bestsme < INT_MAX) + bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv, + second_pred, &cpi->fn_ptr[bsize], 1); + + x->mv_limits = tmp_mv_limits; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + if (cpi->sf.use_upsampled_references) { + // Use up-sampled reference frames. + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = + get_upsampled_ref(cpi, refs[id]); + + // Set pred for Y plane + setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, + upsampled_ref->y_crop_width, + upsampled_ref->y_crop_height, upsampled_ref->y_stride, + (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, + pd->subsampling_y); + +// If bsize < BLOCK_8X8, adjust pred pointer for this block +#if !CONFIG_CB4X4 + if (bsize < BLOCK_8X8) + pd->pre[0].buf = + &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, + pd->pre[0].stride)) + << 3]; +#endif // !CONFIG_CB4X4 + + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, + cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, 1); + + // Restore the reference frames. + pd->pre[0] = backup_pred; + } else { + (void)block; + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, + cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, 0); + } + } + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + if (bestsme < last_besterr[id]) { + frame_mv[refs[id]].as_mv = *best_mv; + last_besterr[id] = bestsme; + } else { + break; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Restore the prediction frame pointers to their unscaled versions. + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER && !CONFIG_CB4X4 + if (bsize >= BLOCK_8X8) +#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); +#if CONFIG_EXT_INTER && !CONFIG_CB4X4 + else + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); +#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 + } +} + +#if CONFIG_REF_MV && !CONFIG_EXT_INTER +static void update_mv_search_and_seg_mvs( + int *const run_mv_search, int_mv *const seg_mvs, int has_second_rf, + const MV_REFERENCE_FRAME *const ref_frame, + const SEG_RDSTAT *const ref_rdstat, int_mv *const bsi_ref_mv[2]) { + if (has_second_rf) { + if (seg_mvs[ref_frame[0]].as_int == ref_rdstat->mvs[0].as_int && + ref_rdstat->mvs[0].as_int != INVALID_MV) + if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int) + --*run_mv_search; + + if (seg_mvs[ref_frame[1]].as_int == ref_rdstat->mvs[1].as_int && + ref_rdstat->mvs[1].as_int != INVALID_MV) + if (bsi_ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int) + --*run_mv_search; + } else { + if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int && + ref_rdstat->mvs[0].as_int != INVALID_MV) { + *run_mv_search = 0; + seg_mvs[ref_frame[0]].as_int = ref_rdstat->mvs[0].as_int; + } + } +} +#endif // CONFIG_REF_MV && !CONFIG_EXT_INTER + +static int64_t rd_pick_inter_best_sub8x8_mode( + const AV1_COMP *const cpi, MACROBLOCK *x, int_mv *best_ref_mv, + int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, + int mvthresh, int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER + int_mv compound_seg_newmvs[4][2], +#endif // CONFIG_EXT_INTER + BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; +#if CONFIG_REF_MV + int_mv tmp_ref_mv[2]; +#endif // CONFIG_REF_MV + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + int mode_idx; + int k, br = 0, idx, idy; + int64_t bd = 0, block_sse = 0; + PREDICTION_MODE this_mode; + const AV1_COMMON *cm = &cpi->common; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int label_count = 4; + int64_t this_segment_rd = 0; + int label_mv_thresh; + int segmentyrate = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; +#if CONFIG_CB4X4 + ENTROPY_CONTEXT t_above[4], t_left[4]; +#else + ENTROPY_CONTEXT t_above[2], t_left[2]; +#endif // CONFIG_CB4X4 + int subpelmv = 1, have_ref = 0; + const int has_second_rf = has_second_ref(mbmi); + const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; +#if CONFIG_PVQ + od_rollback_buffer pre_buf; + + od_encode_checkpoint(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ +#if CONFIG_EXT_TX && CONFIG_RECT_TX + mbmi->tx_size = + xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize]; +#else + mbmi->tx_size = TX_4X4; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX + + av1_zero(*bsi); + + bsi->segment_rd = best_rd; + bsi->ref_mv[0] = best_ref_mv; + bsi->ref_mv[1] = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; + + for (idx = 0; idx < 4; ++idx) bsi->modes[idx] = ZEROMV; + +#if CONFIG_REF_MV + for (idx = 0; idx < 4; ++idx) { + for (k = NEARESTMV; k <= NEWMV; ++k) { + bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV; + bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV; + + bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV; + bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV; + } + } +#endif // CONFIG_REF_MV + + memcpy(t_above, pd->above_context, sizeof(t_above)); + memcpy(t_left, pd->left_context, sizeof(t_left)); + + // 64 makes this threshold really big effectively + // making it so that we very rarely check mvs on + // segments. setting this to 1 would make mv thresh + // roughly equal to what it is for macroblocks + label_mv_thresh = 1 * bsi->mvthresh / label_count; + + // Segmentation method overheads + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + // TODO(jingning,rbultje): rewrite the rate-distortion optimization + // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop + int_mv mode_mv[MB_MODE_COUNT][2]; + int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; + PREDICTION_MODE mode_selected = ZEROMV; + int64_t new_best_rd = INT64_MAX; + const int index = idy * 2 + idx; + int ref; +#if CONFIG_REF_MV + CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE]; + uint8_t ref_mv_count[2]; +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + int_mv ref_mvs_sub8x8[2][2]; +#endif // CONFIG_EXT_INTER +#if CONFIG_PVQ + od_rollback_buffer idx_buf, post_buf; + od_encode_checkpoint(&x->daala_enc, &idx_buf); + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; +#if CONFIG_EXT_INTER + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES]; + av1_update_mv_context(cm, xd, mi, frame, mv_ref_list, index, mi_row, + mi_col, NULL); +#endif // CONFIG_EXT_INTER +#if CONFIG_GLOBAL_MOTION + frame_mv[ZEROMV][frame].as_int = + gm_get_motion_vector(&cm->global_motion[frame], + cm->allow_high_precision_mv, mbmi->sb_type, + mi_col, mi_row, index) + .as_int; +#else // CONFIG_GLOBAL_MOTION + frame_mv[ZEROMV][frame].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + av1_append_sub8x8_mvs_for_idx(cm, xd, index, ref, mi_row, mi_col, +#if CONFIG_REF_MV + ref_mv_stack[ref], &ref_mv_count[ref], +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + mv_ref_list, +#endif // CONFIG_EXT_INTER + &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame]); + +#if CONFIG_REF_MV + tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]]; + lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv); + bsi->ref_mv[ref] = &tmp_ref_mv[ref]; + mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref]; +#endif // CONFIG_REF_MV + +#if CONFIG_EXT_INTER + mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int; + mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int; + av1_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list, + &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]); + + if (has_second_rf) { +#if CONFIG_GLOBAL_MOTION + frame_mv[ZERO_ZEROMV][frame].as_int = + gm_get_motion_vector(&cm->global_motion[frame], + cm->allow_high_precision_mv, mbmi->sb_type, + mi_col, mi_row, index) + .as_int; +#else + frame_mv[ZERO_ZEROMV][frame].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + frame_mv[NEAREST_NEARESTMV][frame].as_int = + frame_mv[NEARESTMV][frame].as_int; + + if (ref == 0) { + frame_mv[NEAREST_NEARMV][frame].as_int = + frame_mv[NEARESTMV][frame].as_int; + frame_mv[NEAR_NEARESTMV][frame].as_int = + frame_mv[NEARMV][frame].as_int; + frame_mv[NEAREST_NEWMV][frame].as_int = + frame_mv[NEARESTMV][frame].as_int; + frame_mv[NEAR_NEWMV][frame].as_int = frame_mv[NEARMV][frame].as_int; + frame_mv[NEAR_NEARMV][frame].as_int = + frame_mv[NEARMV][frame].as_int; + } else if (ref == 1) { + frame_mv[NEAREST_NEARMV][frame].as_int = + frame_mv[NEARMV][frame].as_int; + frame_mv[NEAR_NEARESTMV][frame].as_int = + frame_mv[NEARESTMV][frame].as_int; + frame_mv[NEW_NEARESTMV][frame].as_int = + frame_mv[NEARESTMV][frame].as_int; + frame_mv[NEW_NEARMV][frame].as_int = frame_mv[NEARMV][frame].as_int; + frame_mv[NEAR_NEARMV][frame].as_int = + frame_mv[NEARMV][frame].as_int; + } + } +#endif // CONFIG_EXT_INTER + } + +// search for the best motion vector on this segment +#if CONFIG_EXT_INTER + for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV); + this_mode <= (has_second_rf ? NEW_NEWMV : NEWMV); ++this_mode) +#else + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) +#endif // CONFIG_EXT_INTER + { + const struct buf_2d orig_src = x->plane[0].src; + struct buf_2d orig_pre[2]; + // This flag controls if the motion estimation will kick off. When it + // is set to a non-zero value, the encoder will force motion estimation. + int run_mv_search = 0; + + mode_idx = INTER_OFFSET(this_mode); +#if CONFIG_EXT_INTER + for (ref = 0; ref < 1 + has_second_rf; ++ref) + bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[0][ref].as_int; +#endif // CONFIG_EXT_INTER + bsi->rdstat[index][mode_idx].brdcost = INT64_MAX; + if (!(inter_mode_mask & (1 << this_mode))) continue; + +#if CONFIG_REF_MV + run_mv_search = 2; +#if !CONFIG_EXT_INTER + if (filter_idx > 0 && this_mode == NEWMV) { + const BEST_SEG_INFO *ref_bsi = bsi_buf; + const SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[index][mode_idx]; + + update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index], + has_second_rf, mbmi->ref_frame, + ref_rdstat, bsi->ref_mv); + + if (run_mv_search != 0 && filter_idx > 1) { + ref_bsi = bsi_buf + 1; + ref_rdstat = &ref_bsi->rdstat[index][mode_idx]; + run_mv_search = 2; + update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index], + has_second_rf, mbmi->ref_frame, + ref_rdstat, bsi->ref_mv); + } + } +#endif // !CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + +#if CONFIG_GLOBAL_MOTION + if (cm->global_motion[mbmi->ref_frame[0]].wmtype == IDENTITY && + (!has_second_rf || + cm->global_motion[mbmi->ref_frame[1]].wmtype == IDENTITY)) +#endif // CONFIG_GLOBAL_MOTION + + if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, +#if CONFIG_REF_MV && CONFIG_EXT_INTER + mbmi_ext->compound_mode_context, +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + frame_mv, this_mode, mbmi->ref_frame, bsize, + index, mi_row, mi_col)) + continue; + + memcpy(orig_pre, pd->pre, sizeof(orig_pre)); + memcpy(bsi->rdstat[index][mode_idx].ta, t_above, + sizeof(bsi->rdstat[index][mode_idx].ta)); + memcpy(bsi->rdstat[index][mode_idx].tl, t_left, + sizeof(bsi->rdstat[index][mode_idx].tl)); +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &idx_buf); +#endif // CONFIG_PVQ + + // motion search for newmv (single predictor case only) + if (!has_second_rf && +#if CONFIG_EXT_INTER + have_newmv_in_inter_mode(this_mode) && + (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) +#else + this_mode == NEWMV && + (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV || + run_mv_search) +#endif // CONFIG_EXT_INTER + ) { + int step_param = 0; + int bestsme = INT_MAX; + int sadpb = x->sadperbit4; + MV mvp_full; + int max_mv; + int cost_list[5]; + MvLimits tmp_mv_limits = x->mv_limits; + + /* Is the best so far sufficiently good that we cant justify doing + * and new motion search. */ + if (new_best_rd < label_mv_thresh) break; + +#if CONFIG_EXT_INTER + bsi->mvp.as_int = bsi->ref_mv[0]->as_int; +#else +// use previous block's result as next block's MV predictor. +#if !CONFIG_REF_MV + if (index > 0) { + bsi->mvp.as_int = mi->bmi[index - 1].as_mv[0].as_int; + if (index == 2) + bsi->mvp.as_int = mi->bmi[index - 2].as_mv[0].as_int; + } +#endif // !CONFIG_REF_MV +#endif // CONFIG_EXT_INTER + max_mv = (index == 0) ? (int)x->max_mv_context[mbmi->ref_frame[0]] + : AOMMAX(abs(bsi->mvp.as_mv.row), + abs(bsi->mvp.as_mv.col)) >> + 3; + + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and the best ref mvs of the current block for + // the given reference. + step_param = + (av1_init_search_range(max_mv) + cpi->mv_step_param) / 2; + } else { + step_param = cpi->mv_step_param; + } + +#if CONFIG_REF_MV + mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3; + mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3; +#else + mvp_full.row = bsi->mvp.as_mv.row >> 3; + mvp_full.col = bsi->mvp.as_mv.col >> 3; +#endif // CONFIG_REF_MV + + if (cpi->sf.adaptive_motion_search) { + mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3; + step_param = AOMMAX(step_param, 8); + } + + // adjust src pointer for this block + mi_buf_shift(x, index); + + av1_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); + + x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV; + +#if CONFIG_REF_MV + av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, + &bsi->ref_mv[0]->as_mv, INT_MAX, 1); + + x->mv_limits = tmp_mv_limits; + + if (bestsme < INT_MAX) { + int distortion; + if (cpi->sf.use_upsampled_references) { + int best_mv_var; + const int try_second = + x->second_best_mv.as_int != INVALID_MV && + x->second_best_mv.as_int != x->best_mv.as_int; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + // Use up-sampled reference frames. + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = + get_upsampled_ref(cpi, mbmi->ref_frame[0]); + + // Set pred for Y plane + setup_pred_plane( + &pd->pre[0], bsize, upsampled_ref->y_buffer, + upsampled_ref->y_crop_width, upsampled_ref->y_crop_height, + upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL, + pd->subsampling_x, pd->subsampling_y); + + // adjust pred pointer for this block + pd->pre[0].buf = + &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, index, + pd->pre[0].stride)) + << 3]; + + best_mv_var = cpi->find_fractional_mv_step( + x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, + &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, pw, ph, + 1); + + if (try_second) { + int this_var; + MV best_mv = x->best_mv.as_mv; + const MV ref_mv = bsi->ref_mv[0]->as_mv; + const int minc = + AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX); + const int maxc = + AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX); + const int minr = + AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX); + const int maxr = + AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX); + + x->best_mv = x->second_best_mv; + if (x->best_mv.as_mv.row * 8 <= maxr && + x->best_mv.as_mv.row * 8 >= minr && + x->best_mv.as_mv.col * 8 <= maxc && + x->best_mv.as_mv.col * 8 >= minc) { + this_var = cpi->find_fractional_mv_step( + x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), x->nmvjointcost, + x->mvcost, &distortion, &x->pred_sse[mbmi->ref_frame[0]], + NULL, pw, ph, 1); + if (this_var < best_mv_var) best_mv = x->best_mv.as_mv; + x->best_mv.as_mv = best_mv; + } + } + + // Restore the reference frames. + pd->pre[0] = backup_pred; + } else { + cpi->find_fractional_mv_step( + x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, + &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0, 0); + } + +// save motion search result for use in compound prediction +#if CONFIG_EXT_INTER + seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv; +#else + seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv; +#endif // CONFIG_EXT_INTER + } + + if (cpi->sf.adaptive_motion_search) + x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv; + +#if CONFIG_EXT_INTER + mode_mv[this_mode][0] = x->best_mv; +#else + mode_mv[NEWMV][0] = x->best_mv; +#endif // CONFIG_EXT_INTER + + // restore src pointers + mi_buf_restore(x, orig_src, orig_pre); + } + + if (has_second_rf) { +#if CONFIG_EXT_INTER + if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) +#else + if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) +#endif // CONFIG_EXT_INTER + continue; + } + +#if CONFIG_DUAL_FILTER + (void)run_mv_search; +#endif // CONFIG_DUAL_FILTER + + if (has_second_rf && +#if CONFIG_EXT_INTER + this_mode == NEW_NEWMV && +#else + this_mode == NEWMV && +#endif // CONFIG_EXT_INTER +#if CONFIG_DUAL_FILTER + (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) +#else + (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) +#endif // CONFIG_DUAL_FILTER + { + // adjust src pointers + mi_buf_shift(x, index); + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + int rate_mv; + frame_mv[this_mode][mbmi->ref_frame[0]].as_int = + seg_mvs[index][mbmi->ref_frame[0]].as_int; + frame_mv[this_mode][mbmi->ref_frame[1]].as_int = + seg_mvs[index][mbmi->ref_frame[1]].as_int; + joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, + mi_col, +#if CONFIG_EXT_INTER + bsi->ref_mv, +#endif // CONFIG_EXT_INTER + &rate_mv, index); +#if CONFIG_EXT_INTER + compound_seg_newmvs[index][0].as_int = + frame_mv[this_mode][mbmi->ref_frame[0]].as_int; + compound_seg_newmvs[index][1].as_int = + frame_mv[this_mode][mbmi->ref_frame[1]].as_int; +#else + seg_mvs[index][mbmi->ref_frame[0]].as_int = + frame_mv[this_mode][mbmi->ref_frame[0]].as_int; + seg_mvs[index][mbmi->ref_frame[1]].as_int = + frame_mv[this_mode][mbmi->ref_frame[1]].as_int; +#endif // CONFIG_EXT_INTER + } + // restore src pointers + mi_buf_restore(x, orig_src, orig_pre); + } + + bsi->rdstat[index][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, index, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[index], +#if CONFIG_EXT_INTER + compound_seg_newmvs[index], +#endif // CONFIG_EXT_INTER + bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, mi_col); + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + bsi->rdstat[index][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[index + 1][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[index + 2][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; +#if CONFIG_REF_MV + bsi->rdstat[index][mode_idx].pred_mv[ref].as_int = + mi->bmi[index].pred_mv[ref].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[index + 1][mode_idx].pred_mv[ref].as_int = + mi->bmi[index].pred_mv[ref].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[index + 2][mode_idx].pred_mv[ref].as_int = + mi->bmi[index].pred_mv[ref].as_int; +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + bsi->rdstat[index][mode_idx].ref_mv[ref].as_int = + bsi->ref_mv[ref]->as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[index + 1][mode_idx].ref_mv[ref].as_int = + bsi->ref_mv[ref]->as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[index + 2][mode_idx].ref_mv[ref].as_int = + bsi->ref_mv[ref]->as_int; +#endif // CONFIG_EXT_INTER + } + + // Trap vectors that reach beyond the UMV borders + if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) || + (has_second_rf && + mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv))) + continue; + + if (filter_idx > 0) { + BEST_SEG_INFO *ref_bsi = bsi_buf; + subpelmv = 0; + have_ref = 1; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); +#if CONFIG_EXT_INTER + if (have_newmv_in_inter_mode(this_mode)) + have_ref &= + ((mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) && + (bsi->ref_mv[ref]->as_int == + ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int)); + else +#endif // CONFIG_EXT_INTER + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int; + } + + have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0; + + if (filter_idx > 1 && !subpelmv && !have_ref) { + ref_bsi = bsi_buf + 1; + have_ref = 1; + for (ref = 0; ref < 1 + has_second_rf; ++ref) +#if CONFIG_EXT_INTER + if (have_newmv_in_inter_mode(this_mode)) + have_ref &= + ((mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) && + (bsi->ref_mv[ref]->as_int == + ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int)); + else +#endif // CONFIG_EXT_INTER + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int; + + have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0; + } + + if (!subpelmv && have_ref && + ref_bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) { +#if CONFIG_REF_MV + bsi->rdstat[index][mode_idx].byrate = + ref_bsi->rdstat[index][mode_idx].byrate; + bsi->rdstat[index][mode_idx].bdist = + ref_bsi->rdstat[index][mode_idx].bdist; + bsi->rdstat[index][mode_idx].bsse = + ref_bsi->rdstat[index][mode_idx].bsse; + bsi->rdstat[index][mode_idx].brate += + ref_bsi->rdstat[index][mode_idx].byrate; + bsi->rdstat[index][mode_idx].eobs = + ref_bsi->rdstat[index][mode_idx].eobs; + + bsi->rdstat[index][mode_idx].brdcost = + RDCOST(x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, + bsi->rdstat[index][mode_idx].bdist); + + memcpy(bsi->rdstat[index][mode_idx].ta, + ref_bsi->rdstat[index][mode_idx].ta, + sizeof(bsi->rdstat[index][mode_idx].ta)); + memcpy(bsi->rdstat[index][mode_idx].tl, + ref_bsi->rdstat[index][mode_idx].tl, + sizeof(bsi->rdstat[index][mode_idx].tl)); +#else + memcpy(&bsi->rdstat[index][mode_idx], + &ref_bsi->rdstat[index][mode_idx], sizeof(SEG_RDSTAT)); +#endif // CONFIG_REF_MV + if (num_4x4_blocks_wide > 1) + bsi->rdstat[index + 1][mode_idx].eobs = + ref_bsi->rdstat[index + 1][mode_idx].eobs; + if (num_4x4_blocks_high > 1) + bsi->rdstat[index + 2][mode_idx].eobs = + ref_bsi->rdstat[index + 2][mode_idx].eobs; + + if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) { +#if CONFIG_REF_MV + // If the NEWMV mode is using the same motion vector as the + // NEARESTMV mode, skip the rest rate-distortion calculations + // and use the inferred motion vector modes. + if (this_mode == NEWMV) { + if (has_second_rf) { + if (bsi->rdstat[index][mode_idx].mvs[0].as_int == + bsi->ref_mv[0]->as_int && + bsi->rdstat[index][mode_idx].mvs[1].as_int == + bsi->ref_mv[1]->as_int) + continue; + } else { + if (bsi->rdstat[index][mode_idx].mvs[0].as_int == + bsi->ref_mv[0]->as_int) + continue; + } + } +#endif // CONFIG_REF_MV + mode_selected = this_mode; + new_best_rd = bsi->rdstat[index][mode_idx].brdcost; +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + } + continue; + } + } + + bsi->rdstat[index][mode_idx].brdcost = encode_inter_mb_segment_sub8x8( + cpi, x, bsi->segment_rd - this_segment_rd, index, + &bsi->rdstat[index][mode_idx].byrate, + &bsi->rdstat[index][mode_idx].bdist, + &bsi->rdstat[index][mode_idx].bsse, bsi->rdstat[index][mode_idx].ta, + bsi->rdstat[index][mode_idx].tl, idy, idx, mi_row, mi_col); + + if (bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[index][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, 0); + bsi->rdstat[index][mode_idx].brate += + bsi->rdstat[index][mode_idx].byrate; + bsi->rdstat[index][mode_idx].eobs = p->eobs[index]; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[index + 1][mode_idx].eobs = p->eobs[index + 1]; + if (num_4x4_blocks_high > 1) + bsi->rdstat[index + 2][mode_idx].eobs = p->eobs[index + 2]; + } + + if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) { +#if CONFIG_REF_MV + // If the NEWMV mode is using the same motion vector as the + // NEARESTMV mode, skip the rest rate-distortion calculations + // and use the inferred motion vector modes. + if (this_mode == NEWMV) { + if (has_second_rf) { + if (bsi->rdstat[index][mode_idx].mvs[0].as_int == + bsi->ref_mv[0]->as_int && + bsi->rdstat[index][mode_idx].mvs[1].as_int == + bsi->ref_mv[1]->as_int) + continue; + } else { + if (bsi->rdstat[index][mode_idx].mvs[0].as_int == + bsi->ref_mv[0]->as_int) + continue; + } + } +#endif // CONFIG_REF_MV + mode_selected = this_mode; + new_best_rd = bsi->rdstat[index][mode_idx].brdcost; + +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + } + } /*for each 4x4 mode*/ + + if (new_best_rd == INT64_MAX) { + int iy, midx; + for (iy = index + 1; iy < 4; ++iy) +#if CONFIG_EXT_INTER + for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx) +#else + for (midx = 0; midx < INTER_MODES; ++midx) +#endif // CONFIG_EXT_INTER + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + return INT64_MAX; + } + + mode_idx = INTER_OFFSET(mode_selected); + memcpy(t_above, bsi->rdstat[index][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[index][mode_idx].tl, sizeof(t_left)); +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &post_buf); +#endif // CONFIG_PVQ + +#if CONFIG_EXT_INTER + bsi->ref_mv[0]->as_int = bsi->rdstat[index][mode_idx].ref_mv[0].as_int; + if (has_second_rf) + bsi->ref_mv[1]->as_int = bsi->rdstat[index][mode_idx].ref_mv[1].as_int; +#endif // CONFIG_EXT_INTER + set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[index], +#if CONFIG_EXT_INTER + compound_seg_newmvs[index], +#endif // CONFIG_EXT_INTER + bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, + mi_col); + + br += bsi->rdstat[index][mode_idx].brate; + bd += bsi->rdstat[index][mode_idx].bdist; + block_sse += bsi->rdstat[index][mode_idx].bsse; + segmentyrate += bsi->rdstat[index][mode_idx].byrate; + this_segment_rd += bsi->rdstat[index][mode_idx].brdcost; + + if (this_segment_rd > bsi->segment_rd) { + int iy, midx; + for (iy = index + 1; iy < 4; ++iy) +#if CONFIG_EXT_INTER + for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx) +#else + for (midx = 0; midx < INTER_MODES; ++midx) +#endif // CONFIG_EXT_INTER + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + return INT64_MAX; + } + } + } /* for each label */ +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + + bsi->r = br; + bsi->d = bd; + bsi->segment_yrate = segmentyrate; + bsi->segment_rd = this_segment_rd; + bsi->sse = block_sse; + + // update the coding decisions + for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; + +#if CONFIG_DAALA_DIST + // Compute prediction (i.e. skip) and decoded distortion by daala-distortion. + { + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + uint8_t *src = p->src.buf; + uint8_t *dst = pd->dst.buf; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd); + const int use_activity_masking = 0; + const int qm = OD_HVS_QM; + const int bsw = block_size_wide[plane_bsize]; + const int bsh = block_size_high[plane_bsize]; + int64_t rd1, rd2; + int64_t daala_sse, daala_dist; + TX_SIZE tx_size = mbmi->tx_size; + +#if CONFIG_HIGHBITDEPTH + uint8_t *recon_8x8; + DECLARE_ALIGNED(16, uint16_t, recon16[8 * 8]); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + recon_8x8 = CONVERT_TO_BYTEPTR(recon16); + else + recon_8x8 = (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon_8x8[8 * 8]); +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_PVQ + use_activity_masking = x->daala_enc.use_activity_masking; +#endif // CONFIG_PVQ + + // For each of sub8x8 prediction block in a 8x8 block + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + int i = idy * 2 + idx; + const uint8_t *const src_sub8x8 = + src + av1_raster_block_offset(BLOCK_8X8, i, p->src.stride); + uint8_t *const dst_sub8x8 = + dst + av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride); + uint8_t *recon_sub8x8 = recon_8x8 + (idy * 8 + idx) * 4; + const int txb_width = max_block_wide(xd, plane_bsize, 0); + const int txb_height = max_block_high(xd, plane_bsize, 0); + int idx_, idy_; + + av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block( + height, width, + av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, + src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride, xd->bd); + } else { + aom_subtract_block( + height, width, + av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, + src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride); + } +#else + aom_subtract_block( + bsh, bsw, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride); +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, + NULL, 0, NULL, 0, bsw, bsh, xd->bd); + } else { +#endif // CONFIG_HIGHBITDEPTH + aom_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, NULL, 0, + NULL, 0, bsw, bsh); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + // To get decoded pixels, do 4x4 xform and quant for each 4x4 block + // in a sub8x8 prediction block. In case remaining parts of + // sub8x8 inter mode rdo assume pd->dst stores predicted pixels, + // use local buffer to store decoded pixels. + for (idy_ = 0; idy_ < txb_height; idy_++) { + for (idx_ = 0; idx_ < txb_width; idx_++) { + int coeff_ctx = 0; + const tran_low_t *dqcoeff; + uint16_t eob; + const PLANE_TYPE plane_type = PLANE_TYPE_Y; + uint8_t *recon_4x4 = recon_sub8x8 + (idy_ * 8 + idx_) * 4; + const int block_raster_idx = (idy + idy_) * 2 + (idx + idx_); + const int block = + av1_raster_order_to_block_index(tx_size, block_raster_idx); + TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + av1_xform_quant(cm, x, 0, block, idy + idy_, idx + idx_, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); + if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0) + av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); + + eob = p->eobs[block]; + av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, + recon_4x4, 8, eob); + } + } + } + } + // Compute daala-distortion for a 8x8 block + daala_sse = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, 8, 8, + qm, use_activity_masking, x->qindex) + << 4; + + daala_dist = av1_daala_dist(src, src_stride, recon_8x8, 8, 8, 8, qm, + use_activity_masking, x->qindex) + << 4; + + bsi->sse = daala_sse; + bsi->d = daala_dist; + + rd1 = RDCOST(x->rdmult, x->rddiv, bsi->r, bsi->d); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, bsi->sse); + bsi->segment_rd = AOMMIN(rd1, rd2); + } +#endif // CONFIG_DAALA_DIST + + if (bsi->segment_rd > best_rd) return INT64_MAX; + /* set it to the best */ + for (idx = 0; idx < 4; idx++) { + mode_idx = INTER_OFFSET(bsi->modes[idx]); + mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int; + if (has_second_ref(mbmi)) + mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int; +#if CONFIG_REF_MV + mi->bmi[idx].pred_mv[0] = bsi->rdstat[idx][mode_idx].pred_mv[0]; + if (has_second_ref(mbmi)) + mi->bmi[idx].pred_mv[1] = bsi->rdstat[idx][mode_idx].pred_mv[1]; +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int; + if (has_second_rf) + mi->bmi[idx].ref_mv[1].as_int = + bsi->rdstat[idx][mode_idx].ref_mv[1].as_int; +#endif // CONFIG_EXT_INTER + x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs; + mi->bmi[idx].as_mode = bsi->modes[idx]; + } + + /* + * used to set mbmi->mv.as_int + */ + *returntotrate = bsi->r; + *returndistortion = bsi->d; + *returnyrate = bsi->segment_yrate; + *skippable = av1_is_skippable_in_plane(x, BLOCK_8X8, 0); + *psse = bsi->sse; + mbmi->mode = bsi->modes[3]; + + return bsi->segment_rd; +} + +static void estimate_ref_frame_costs(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int segment_id, + unsigned int *ref_costs_single, + unsigned int *ref_costs_comp, + aom_prob *comp_mode_p) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, + TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single)); + memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp)); + *comp_mode_p = 128; + } else { + aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd); + aom_prob comp_inter_p = 128; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + comp_inter_p = av1_get_reference_mode_prob(cm, xd); + *comp_mode_p = comp_inter_p; + } else { + *comp_mode_p = 128; + } + + ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0); + + if (cm->reference_mode != COMPOUND_REFERENCE) { + aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd); + aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd); +#if CONFIG_EXT_REFS + aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd); + aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd); + aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd); +#endif // CONFIG_EXT_REFS + + unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); + + ref_costs_single[LAST_FRAME] = +#if CONFIG_EXT_REFS + ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] = + ref_costs_single[BWDREF_FRAME] = +#endif // CONFIG_EXT_REFS + ref_costs_single[GOLDEN_FRAME] = + ref_costs_single[ALTREF_FRAME] = base_cost; + +#if CONFIG_EXT_REFS + ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); + ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0); + ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0); + ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0); + ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1); + ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); + + ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0); + ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0); + ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1); + ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1); + + ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0); + ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1); + + ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0); + ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1); + + ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0); + ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1); +#else + ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); + ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1); + ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); + + ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0); + ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1); +#endif // CONFIG_EXT_REFS + } else { + ref_costs_single[LAST_FRAME] = 512; +#if CONFIG_EXT_REFS + ref_costs_single[LAST2_FRAME] = 512; + ref_costs_single[LAST3_FRAME] = 512; + ref_costs_single[BWDREF_FRAME] = 512; +#endif // CONFIG_EXT_REFS + ref_costs_single[GOLDEN_FRAME] = 512; + ref_costs_single[ALTREF_FRAME] = 512; + } + + if (cm->reference_mode != SINGLE_REFERENCE) { + aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd); +#if CONFIG_EXT_REFS + aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd); + aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd); + aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd); +#endif // CONFIG_EXT_REFS + + unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); + + ref_costs_comp[LAST_FRAME] = +#if CONFIG_EXT_REFS + ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] = +#endif // CONFIG_EXT_REFS + ref_costs_comp[GOLDEN_FRAME] = base_cost; + +#if CONFIG_EXT_REFS + ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0; +#endif // CONFIG_EXT_REFS + +#if CONFIG_EXT_REFS + ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1); + ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); + + ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1); + ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0); + + ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0); + ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); + + // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1 + // more bit. + ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); + ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); +#else + ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); +#endif // CONFIG_EXT_REFS + } else { + ref_costs_comp[LAST_FRAME] = 512; +#if CONFIG_EXT_REFS + ref_costs_comp[LAST2_FRAME] = 512; + ref_costs_comp[LAST3_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME] = 512; + ref_costs_comp[ALTREF_FRAME] = 512; +#endif // CONFIG_EXT_REFS + ref_costs_comp[GOLDEN_FRAME] = 512; + } + } +} + +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int mode_index, + int64_t comp_pred_diff[REFERENCE_MODES], + int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->skip = x->skip; + ctx->skippable = skippable; + ctx->best_mode_index = mode_index; + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; + ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; + ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; + ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; +} + +static void setup_buffer_inter( + const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, int mi_row, int mi_col, + int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME], + int_mv frame_near_mv[TOTAL_REFS_PER_FRAME], + struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) { + const AV1_COMMON *cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + + assert(yv12 != NULL); + + // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this + // use the UV scaling factors. + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); + + // Gets an initial list of candidate vectors from neighbours and orders them + av1_find_mv_refs( + cm, xd, mi, ref_frame, +#if CONFIG_REF_MV + &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame], +#if CONFIG_EXT_INTER + mbmi_ext->compound_mode_context, +#endif // CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + candidates, mi_row, mi_col, NULL, NULL, mbmi_ext->mode_context); + + // Candidate refinement carried out at encoder and decoder + av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, + &frame_nearest_mv[ref_frame], + &frame_near_mv[ref_frame]); + +// Further refinement that is encode side only to test the top few candidates +// in full and choose the best as the centre point for subsequent searches. +// The current implementation doesn't support scaling. +#if CONFIG_CB4X4 + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + block_size); +#else + if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8) + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + block_size); +#endif // CONFIG_CB4X4 +} + +static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, +#if CONFIG_EXT_INTER + int ref_idx, +#endif // CONFIG_EXT_INTER + int *rate_mv) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + int step_param; + int sadpb = x->sadperbit16; + MV mvp_full; +#if CONFIG_EXT_INTER + int ref = mbmi->ref_frame[ref_idx]; +#else + int ref = mbmi->ref_frame[0]; + int ref_idx = 0; +#endif // CONFIG_EXT_INTER + MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + + MvLimits tmp_mv_limits = x->mv_limits; + int cost_list[5]; + + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + MV pred_mv[3]; + pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; + pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref]; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); + } + + av1_set_mv_search_range(&x->mv_limits, &ref_mv); + +#if CONFIG_REF_MV + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = + (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) / + 2; + } else { + step_param = cpi->mv_step_param; + } + + if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) { + int boffset = + 2 * (b_width_log2_lookup[cm->sb_size] - + AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); + step_param = AOMMAX(step_param, boffset); + } + + if (cpi->sf.adaptive_motion_search) { + int bwl = b_width_log2_lookup[bsize]; + int bhl = b_height_log2_lookup[bsize]; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); + + if (tlevel < 5) step_param += 2; + + // prev_mv_sad is not setup for dynamically scaled frames. + if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) { + int i; + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; + x->best_mv.as_int = INVALID_MV; + + if (scaled_ref_frame) { + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[ref_idx] = backup_yv12[j]; + } + return; + } + } + } + } + + av1_set_mv_search_range(&x->mv_limits, &ref_mv); + +#if CONFIG_MOTION_VAR + if (mbmi->motion_mode != SIMPLE_TRANSLATION) + mvp_full = mbmi->mv[0].as_mv; + else +#endif // CONFIG_MOTION_VAR + mvp_full = pred_mv[x->mv_best_ref_index[ref]]; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV; + +#if CONFIG_MOTION_VAR + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: +#endif // CONFIG_MOTION_VAR + bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, INT_MAX, 1); +#if CONFIG_MOTION_VAR + break; + case OBMC_CAUSAL: + bestsme = av1_obmc_full_pixel_diamond( + cpi, x, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, + &(x->best_mv.as_mv), 0); + break; + default: assert("Invalid motion mode!\n"); + } +#endif // CONFIG_MOTION_VAR + + x->mv_limits = tmp_mv_limits; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. */ +#if CONFIG_MOTION_VAR + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: +#endif // CONFIG_MOTION_VAR + if (cpi->sf.use_upsampled_references) { + int best_mv_var; + const int try_second = x->second_best_mv.as_int != INVALID_MV && + x->second_best_mv.as_int != x->best_mv.as_int; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + // Use up-sampled reference frames. + struct macroblockd_plane *const pd = &xd->plane[0]; + struct buf_2d backup_pred = pd->pre[ref_idx]; + const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); + + // Set pred for Y plane + setup_pred_plane( + &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer, + upsampled_ref->y_crop_width, upsampled_ref->y_crop_height, + upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL, + pd->subsampling_x, pd->subsampling_y); + + best_mv_var = cpi->find_fractional_mv_step( + x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + 1); + + if (try_second) { + const int minc = + AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX); + const int maxc = + AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX); + const int minr = + AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX); + const int maxr = + AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX); + int this_var; + MV best_mv = x->best_mv.as_mv; + + x->best_mv = x->second_best_mv; + if (x->best_mv.as_mv.row * 8 <= maxr && + x->best_mv.as_mv.row * 8 >= minr && + x->best_mv.as_mv.col * 8 <= maxc && + x->best_mv.as_mv.col * 8 >= minc) { + this_var = cpi->find_fractional_mv_step( + x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref], NULL, pw, ph, 1); + if (this_var < best_mv_var) best_mv = x->best_mv.as_mv; + x->best_mv.as_mv = best_mv; + } + } + + // Restore the reference frames. + pd->pre[ref_idx] = backup_pred; + } else { + cpi->find_fractional_mv_step( + x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + 0); + } +#if CONFIG_MOTION_VAR + break; + case OBMC_CAUSAL: + av1_find_best_obmc_sub_pixel_tree_up( + cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv, + cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0, + cpi->sf.use_upsampled_references); + break; + default: assert("Invalid motion mode!\n"); + } +#endif // CONFIG_MOTION_VAR + } + *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + +#if CONFIG_MOTION_VAR + if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION) +#else + if (cpi->sf.adaptive_motion_search) +#endif // CONFIG_MOTION_VAR + x->pred_mv[ref] = x->best_mv.as_mv; + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } +} + +static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = dst.plane[i]; + xd->plane[i].dst.stride = dst.stride[i]; + } +} + +#if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +static void do_masked_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + const uint8_t *mask, int mask_stride, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv, int *rate_mv, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + int step_param; + int sadpb = x->sadperbit16; + MV mvp_full; + int ref = mbmi->ref_frame[ref_idx]; + MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + + MvLimits tmp_mv_limits = x->mv_limits; + + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + int i; + + MV pred_mv[3]; + pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; + pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref]; + +#if CONFIG_REF_MV + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); + } + + av1_set_mv_search_range(&x->mv_limits, &ref_mv); + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = + (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) / + 2; + } else { + step_param = cpi->mv_step_param; + } + + // TODO(debargha): is show_frame needed here? + if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size && cm->show_frame) { + int boffset = + 2 * (b_width_log2_lookup[cm->sb_size] - + AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); + step_param = AOMMAX(step_param, boffset); + } + + if (cpi->sf.adaptive_motion_search) { + int bwl = b_width_log2_lookup[bsize]; + int bhl = b_height_log2_lookup[bsize]; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); + + if (tlevel < 5) step_param += 2; + + // prev_mv_sad is not setup for dynamically scaled frames. + if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) { + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; + tmp_mv->as_int = INVALID_MV; + + if (scaled_ref_frame) { + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[ref_idx] = backup_yv12[j]; + } + return; + } + } + } + } + + mvp_full = pred_mv[x->mv_best_ref_index[ref]]; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + bestsme = av1_masked_full_pixel_diamond( + cpi, x, mask, mask_stride, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, + &tmp_mv->as_mv, ref_idx); + + x->mv_limits = tmp_mv_limits; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. */ + av1_find_best_masked_sub_pixel_tree_up( + cpi, x, mask, mask_stride, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv, + cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], ref_idx, + cpi->sf.use_upsampled_references); + } + *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + + if (cpi->sf.adaptive_motion_search && cm->show_frame) + x->pred_mv[ref] = tmp_mv->as_mv; + + if (scaled_ref_frame) { + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } +} + +static void do_masked_motion_search_indexed( + const AV1_COMP *const cpi, MACROBLOCK *x, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) { + // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + BLOCK_SIZE sb_type = mbmi->sb_type; + const uint8_t *mask; + const int mask_stride = block_size_wide[bsize]; + + mask = av1_get_compound_type_mask(comp_data, sb_type); + + if (which == 0 || which == 2) + do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col, + &tmp_mv[0], &rate_mv[0], 0); + + if (which == 1 || which == 2) { +// get the negative mask +#if CONFIG_COMPOUND_SEGMENT + uint8_t inv_mask_buf[2 * MAX_SB_SQUARE]; + const int h = block_size_high[bsize]; + mask = av1_get_compound_type_mask_inverse( + comp_data, inv_mask_buf, h, mask_stride, mask_stride, sb_type); +#else + mask = av1_get_compound_type_mask_inverse(comp_data, sb_type); +#endif // CONFIG_COMPOUND_SEGMENT + do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col, + &tmp_mv[1], &rate_mv[1], 1); + } +} +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#endif // CONFIG_EXT_INTER + +// In some situations we want to discount tha pparent cost of a new motion +// vector. Where there is a subtle motion field and especially where there is +// low spatial complexity then it can be hard to cover the cost of a new motion +// vector in a single block, even if that motion vector reduces distortion. +// However, once established that vector may be usable through the nearest and +// near mv modes to reduce distortion in subsequent blocks and also improve +// visual quality. +static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode, + int_mv this_mv, + int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], + int ref_frame) { + return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && + (this_mv.as_int != 0) && + ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || + (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && + ((mode_mv[NEARMV][ref_frame].as_int == 0) || + (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +} + +#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); +} + +#if CONFIG_EXT_INTER +#if CONFIG_WEDGE +static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, + const BLOCK_SIZE bsize, const uint8_t *pred0, + int stride0, const uint8_t *pred1, int stride1) { + const struct macroblock_plane *const p = &x->plane[0]; + const uint8_t *src = p->src.buf; + int src_stride = p->src.stride; + const int f_index = bsize - BLOCK_8X8; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + uint32_t esq[2][4], var; + int64_t tl, br; + +#if CONFIG_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + pred0 = CONVERT_TO_BYTEPTR(pred0); + pred1 = CONVERT_TO_BYTEPTR(pred1); + } +#endif // CONFIG_HIGHBITDEPTH + + var = cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); + var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, + stride0, &esq[0][1]); + var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, + pred0 + bh / 2 * stride0, stride0, &esq[0][2]); + var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, + pred0 + bh / 2 * stride0 + bw / 2, stride0, + &esq[0][3]); + var = cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); + var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, + stride1, &esq[1][1]); + var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, + pred1 + bh / 2 * stride1, stride0, &esq[1][2]); + var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, + pred1 + bh / 2 * stride1 + bw / 2, stride0, + &esq[1][3]); + (void)var; + + tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) - + (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]); + br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) - + (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]); + return (tl + br > 0); +} +#endif // CONFIG_WEDGE +#endif // CONFIG_EXT_INTER + +#if !CONFIG_DUAL_FILTER +static InterpFilter predict_interp_filter( + const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize, + const int mi_row, const int mi_col, + InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) { + InterpFilter best_filter = SWITCHABLE; + const AV1_COMMON *cm = &cpi->common; + const MACROBLOCKD *xd = &x->e_mbd; + int bsl = mi_width_log2_lookup[bsize]; + int pred_filter_search = + cpi->sf.cb_pred_filter_search + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1 + : 0; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int is_comp_pred = has_second_ref(mbmi); + const int this_mode = mbmi->mode; + int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + if (pred_filter_search) { + InterpFilter af = SWITCHABLE, lf = SWITCHABLE; + if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter; + +#if CONFIG_EXT_INTER + if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf)) +#else + if ((this_mode != NEWMV) || (af == lf)) +#endif // CONFIG_EXT_INTER + best_filter = af; + } + if (is_comp_pred) { + if (cpi->sf.adaptive_mode_search) { +#if CONFIG_EXT_INTER + switch (this_mode) { + case NEAREST_NEARESTMV: + if (single_filter[NEARESTMV][refs[0]] == + single_filter[NEARESTMV][refs[1]]) + best_filter = single_filter[NEARESTMV][refs[0]]; + break; + case NEAREST_NEARMV: + if (single_filter[NEARESTMV][refs[0]] == + single_filter[NEARMV][refs[1]]) + best_filter = single_filter[NEARESTMV][refs[0]]; + break; + case NEAR_NEARESTMV: + if (single_filter[NEARMV][refs[0]] == + single_filter[NEARESTMV][refs[1]]) + best_filter = single_filter[NEARMV][refs[0]]; + break; + case NEAR_NEARMV: + if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]]) + best_filter = single_filter[NEARMV][refs[0]]; + break; + case ZERO_ZEROMV: + if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]]) + best_filter = single_filter[ZEROMV][refs[0]]; + break; + case NEW_NEWMV: + if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]]) + best_filter = single_filter[NEWMV][refs[0]]; + break; + case NEAREST_NEWMV: + if (single_filter[NEARESTMV][refs[0]] == + single_filter[NEWMV][refs[1]]) + best_filter = single_filter[NEARESTMV][refs[0]]; + break; + case NEAR_NEWMV: + if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]]) + best_filter = single_filter[NEARMV][refs[0]]; + break; + case NEW_NEARESTMV: + if (single_filter[NEWMV][refs[0]] == + single_filter[NEARESTMV][refs[1]]) + best_filter = single_filter[NEWMV][refs[0]]; + break; + case NEW_NEARMV: + if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]]) + best_filter = single_filter[NEWMV][refs[0]]; + break; + default: + if (single_filter[this_mode][refs[0]] == + single_filter[this_mode][refs[1]]) + best_filter = single_filter[this_mode][refs[0]]; + break; + } +#else + if (single_filter[this_mode][refs[0]] == + single_filter[this_mode][refs[1]]) + best_filter = single_filter[this_mode][refs[0]]; +#endif // CONFIG_EXT_INTER + } + } + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { + best_filter = EIGHTTAP_REGULAR; + } + return best_filter; +} +#endif // !CONFIG_DUAL_FILTER + +#if CONFIG_EXT_INTER +// Choose the best wedge index and sign +#if CONFIG_WEDGE +static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, + const BLOCK_SIZE bsize, const uint8_t *const p0, + const uint8_t *const p1, int *const best_wedge_sign, + int *const best_wedge_index) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int wedge_index; + int wedge_sign; + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + const uint8_t *mask; + uint64_t sse; +#if CONFIG_HIGHBITDEPTH + const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; +#else + const int bd_round = 0; +#endif // CONFIG_HIGHBITDEPTH + + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); + + int64_t sign_limit; + +#if CONFIG_HIGHBITDEPTH + if (hbd) { + aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else // NOLINT +#endif // CONFIG_HIGHBITDEPTH + { + aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); + aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); + } + + sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) - + (int64_t)aom_sum_squares_i16(r1, N)) * + (1 << WEDGE_WEIGHT_BITS) / 2; + + if (N < 64) + av1_wedge_compute_delta_squares_c(ds, r0, r1, N); + else + av1_wedge_compute_delta_squares(ds, r0, r1, N); + + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); + + // TODO(jingning): Make sse2 functions support N = 16 case + if (N < 64) + wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit); + else + wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); + + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + if (N < 64) + sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N); + else + sse = av1_wedge_sse_from_residuals(r1, d10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + *best_wedge_sign = wedge_sign; + best_rd = rd; + } + } + + return best_rd; +} + +// Choose the best wedge index the specified sign +static int64_t pick_wedge_fixed_sign( + const AV1_COMP *const cpi, const MACROBLOCK *const x, + const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, + const int wedge_sign, int *const best_wedge_index) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int wedge_index; + int wedge_types = (1 << get_wedge_bits_lookup(bsize)); + const uint8_t *mask; + uint64_t sse; +#if CONFIG_HIGHBITDEPTH + const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; +#else + const int bd_round = 0; +#endif // CONFIG_HIGHBITDEPTH + + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); + +#if CONFIG_HIGHBITDEPTH + if (hbd) { + aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else // NOLINT +#endif // CONFIG_HIGHBITDEPTH + { + aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); + } + + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + if (N < 64) + sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N); + else + sse = av1_wedge_sse_from_residuals(r1, d10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + best_rd = rd; + } + } + + return best_rd; +} + +static int64_t pick_interinter_wedge(const AV1_COMP *const cpi, + MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int bw = block_size_wide[bsize]; + + int64_t rd; + int wedge_index = -1; + int wedge_sign = 0; + + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + + if (cpi->sf.fast_wedge_sign_estimate) { + wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); + rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index); + } else { + rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index); + } + + mbmi->wedge_sign = wedge_sign; + mbmi->wedge_index = wedge_index; + return rd; +} +#endif // CONFIG_WEDGE + +#if CONFIG_COMPOUND_SEGMENT +static int64_t pick_interinter_seg(const AV1_COMP *const cpi, + MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + int rate; + uint64_t sse; + int64_t dist; + int64_t rd0; + SEG_MASK_TYPE cur_mask_type; + int64_t best_rd = INT64_MAX; + SEG_MASK_TYPE best_mask_type = 0; +#if CONFIG_HIGHBITDEPTH + const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; +#else + const int bd_round = 0; +#endif // CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]); + +#if CONFIG_HIGHBITDEPTH + if (hbd) { + aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else // NOLINT +#endif // CONFIG_HIGHBITDEPTH + { + aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw); + aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw); + } + + // try each mask type and its inverse + for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) { +// build mask and inverse +#if CONFIG_HIGHBITDEPTH + if (hbd) + build_compound_seg_mask_highbd( + xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + else +#endif // CONFIG_HIGHBITDEPTH + build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw, + bsize, bh, bw); + + // compute rd for mask + sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist); + + if (rd0 < best_rd) { + best_mask_type = cur_mask_type; + best_rd = rd0; + } + } + + // make final mask + mbmi->mask_type = best_mask_type; +#if CONFIG_HIGHBITDEPTH + if (hbd) + build_compound_seg_mask_highbd( + xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd); + else +#endif // CONFIG_HIGHBITDEPTH + build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw, + bsize, bh, bw); + + return best_rd; +} +#endif // CONFIG_COMPOUND_SEGMENT + +#if CONFIG_WEDGE && CONFIG_INTERINTRA +static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + + int64_t rd; + int wedge_index = -1; + + assert(is_interintra_wedge_used(bsize)); + + rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index); + + mbmi->interintra_wedge_sign = 0; + mbmi->interintra_wedge_index = wedge_index; + return rd; +} +#endif // CONFIG_WEDGE && CONFIG_INTERINTRA + +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const COMPOUND_TYPE compound_type = + x->e_mbd.mi[0]->mbmi.interinter_compound_type; + switch (compound_type) { +#if CONFIG_WEDGE + case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1); +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1); +#endif // CONFIG_COMPOUND_SEGMENT + default: assert(0); return 0; + } +} + +static int interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const BLOCK_SIZE bsize, + const int this_mode, int mi_row, + int mi_col) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int_mv tmp_mv[2]; + int rate_mvs[2], tmp_rate_mv = 0; + const INTERINTER_COMPOUND_DATA compound_data = { +#if CONFIG_WEDGE + mbmi->wedge_index, + mbmi->wedge_sign, +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + mbmi->mask_type, + xd->seg_mask, +#endif // CONFIG_COMPOUND_SEGMENT + mbmi->interinter_compound_type + }; + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, + mi_col, tmp_mv, rate_mvs, 2); + tmp_rate_mv = rate_mvs[0] + rate_mvs[1]; + mbmi->mv[0].as_int = tmp_mv[0].as_int; + mbmi->mv[1].as_int = tmp_mv[1].as_int; + } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { + do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, + mi_col, tmp_mv, rate_mvs, 0); + tmp_rate_mv = rate_mvs[0]; + mbmi->mv[0].as_int = tmp_mv[0].as_int; + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, + mi_col, tmp_mv, rate_mvs, 1); + tmp_rate_mv = rate_mvs[1]; + mbmi->mv[1].as_int = tmp_mv[1].as_int; + } + return tmp_rate_mv; +} + +static int64_t build_and_cost_compound_type( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv, + BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, + int *strides, int mi_row, int mi_col) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int rate_sum; + int64_t dist_sum; + int64_t best_rd_cur = INT64_MAX; + int64_t rd = INT64_MAX; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type; + + best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); + best_rd_cur += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0); + + if (have_newmv_in_inter_mode(this_mode) && + use_masked_motion_search(compound_type)) { + *out_rate_mv = interinter_compound_motion_search(cpi, x, bsize, this_mode, + mi_row, mi_col); + av1_build_inter_predictors_sby(xd, mi_row, mi_col, ctx, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb); + rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + if (rd >= best_rd_cur) { + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + *out_rate_mv = rate_mv; + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, +#if CONFIG_SUPERTX + 0, 0, +#endif // CONFIG_SUPERTX + preds0, strides, preds1, + strides); + } + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + best_rd_cur = rd; + + } else { + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, +#if CONFIG_SUPERTX + 0, 0, +#endif // CONFIG_SUPERTX + preds0, strides, preds1, strides); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + best_rd_cur = rd; + } + return best_rd_cur; +} +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#endif // CONFIG_EXT_INTER + +typedef struct { +#if CONFIG_MOTION_VAR + // Inter prediction buffers and respective strides + uint8_t *above_pred_buf[MAX_MB_PLANE]; + int above_pred_stride[MAX_MB_PLANE]; + uint8_t *left_pred_buf[MAX_MB_PLANE]; + int left_pred_stride[MAX_MB_PLANE]; +#endif // CONFIG_MOTION_VAR + int_mv *single_newmv; +#if CONFIG_EXT_INTER + // Pointer to array of motion vectors to use for each ref and their rates + // Should point to first of 2 arrays in 2D array + int *single_newmv_rate; + // Pointers costs of compound inter-intra and inter-inter predictions + int *compmode_interintra_cost; + int *compmode_interinter_cost; + // Pointer to array of predicted rate-distortion + // Should point to first of 2 arrays in 2D array + int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_INTER + InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +} HandleInterModeArgs; + +static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, + int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], + const int mi_row, const int mi_col, + int *const rate_mv, int_mv *const single_newmv, + HandleInterModeArgs *const args) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; +#if CONFIG_EXT_INTER + const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); +#endif // CONFIG_EXT_INTER + int_mv *const frame_mv = mode_mv[this_mode]; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; + int i; + + (void)args; + + if (is_comp_pred) { +#if CONFIG_EXT_INTER + for (i = 0; i < 2; ++i) { + single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int; + } + + if (this_mode == NEW_NEWMV) { + frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, + rate_mv, 0); + } else { + *rate_mv = 0; + for (i = 0; i < 2; ++i) { +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + *rate_mv += av1_mv_bit_cost( + &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi_ext->ref_mvs[refs[1]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } else { + assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); + frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } +#else + // Initialize mv using single prediction mode result. + frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0); + } else { + *rate_mv = 0; + for (i = 0; i < 2; ++i) { +#if CONFIG_REF_MV + av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); +#endif // CONFIG_REF_MV + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv, + &mbmi_ext->ref_mvs[refs[i]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } + } +#endif // CONFIG_EXT_INTER + } else { +#if CONFIG_EXT_INTER + if (is_comp_interintra_pred) { + x->best_mv = args->single_newmv[refs[0]]; + *rate_mv = args->single_newmv_rate[refs[0]]; + } else { + single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv); + args->single_newmv[refs[0]] = x->best_mv; + args->single_newmv_rate[refs[0]] = *rate_mv; + } +#else + single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv); + single_newmv[refs[0]] = x->best_mv; +#endif // CONFIG_EXT_INTER + + if (x->best_mv.as_int == INVALID_MV) return INT64_MAX; + + frame_mv[refs[0]] = x->best_mv; + xd->mi[0]->bmi[0].as_mv[0] = x->best_mv; + + // Estimate the rate implications of a new mv but discount this + // under certain circumstances where we want to help initiate a weak + // motion field, where the distortion gain for a single block may not + // be enough to overcome the cost of a new mv. + if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) { + *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1); + } + } + + return 0; +} + +int64_t interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, + BUFFER_SET *const orig_dst, + InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME], + int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int i; + int tmp_rate; + int64_t tmp_dist; + + (void)single_filter; + + InterpFilter assign_filter = SWITCHABLE; + + if (cm->interp_filter == SWITCHABLE) { +#if !CONFIG_DUAL_FILTER + assign_filter = av1_is_interp_needed(xd) + ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col, + single_filter) + : cm->interp_filter; +#endif // !CONFIG_DUAL_FILTER + } else { + assign_filter = cm->interp_filter; + } + + set_default_interp_filters(mbmi, assign_filter); + + *switchable_rate = av1_get_switchable_rate(cpi, xd); + av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, + skip_txfm_sb, skip_sse_sb); + *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist); + + if (assign_filter == SWITCHABLE) { + // do interp_filter search + if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) { +#if CONFIG_DUAL_FILTER + const int filter_set_size = DUAL_FILTER_SET_SIZE; +#else + const int filter_set_size = SWITCHABLE_FILTERS; +#endif // CONFIG_DUAL_FILTER + int best_in_temp = 0; +#if CONFIG_DUAL_FILTER + InterpFilter best_filter[4]; + av1_copy(best_filter, mbmi->interp_filter); +#else + InterpFilter best_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + restore_dst_buf(xd, *tmp_dst); + // EIGHTTAP_REGULAR mode is calculated beforehand + for (i = 1; i < filter_set_size; ++i) { + int tmp_skip_sb = 0; + int64_t tmp_skip_sse = INT64_MAX; + int tmp_rs; + int64_t tmp_rd; +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = filter_sets[i][0]; + mbmi->interp_filter[1] = filter_sets[i][1]; + mbmi->interp_filter[2] = filter_sets[i][0]; + mbmi->interp_filter[3] = filter_sets[i][1]; +#else + mbmi->interp_filter = (InterpFilter)i; +#endif // CONFIG_DUAL_FILTER + tmp_rs = av1_get_switchable_rate(cpi, xd); + av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, + &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); + tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist); + + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = av1_get_switchable_rate(cpi, xd); +#if CONFIG_DUAL_FILTER + av1_copy(best_filter, mbmi->interp_filter); +#else + best_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + *skip_txfm_sb = tmp_skip_sb; + *skip_sse_sb = tmp_skip_sse; + best_in_temp = !best_in_temp; + if (best_in_temp) { + restore_dst_buf(xd, *orig_dst); + } else { + restore_dst_buf(xd, *tmp_dst); + } + } + } + if (best_in_temp) { + restore_dst_buf(xd, *tmp_dst); + } else { + restore_dst_buf(xd, *orig_dst); + } +#if CONFIG_DUAL_FILTER + av1_copy(mbmi->interp_filter, best_filter); +#else + mbmi->interp_filter = best_filter; +#endif // CONFIG_DUAL_FILTER + } else { +#if CONFIG_DUAL_FILTER + for (i = 0; i < 4; ++i) + assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR); +#else + assert(mbmi->interp_filter == EIGHTTAP_REGULAR); +#endif // CONFIG_DUAL_FILTER + } + } + + return 0; +} + +// TODO(afergs): Refactor the MBMI references in here - there's four +// TODO(afergs): Refactor optional args - add them to a struct or remove +static int64_t motion_mode_rd( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, + int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, + const int *refs, int rate_mv, +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_EXT_INTER + int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, +#if CONFIG_MOTION_VAR + int rate_mv_bmc, +#endif // CONFIG_MOTION_VAR +#endif // CONFIG_EXT_INTER +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + + (void)mode_mv; + (void)mi_row; + (void)mi_col; + (void)args; + (void)refs; + (void)rate_mv; + (void)is_comp_pred; + (void)this_mode; + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + MOTION_MODE motion_mode, last_motion_mode_allowed; + int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + MB_MODE_INFO base_mbmi, best_mbmi; +#if CONFIG_VAR_TX + uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4]; +#endif // CONFIG_VAR_TX +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + +#if CONFIG_WARPED_MOTION + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#endif // CONFIG_WARPED_MOTION + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + av1_invalid_rd_stats(&best_rd_stats); +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + + if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs; +#if CONFIG_WARPED_MOTION + aom_clear_system_state(); + mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); +#if CONFIG_EXT_INTER + best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; +#endif // CONFIG_EXT_INTER +#endif // CONFIG_WARPED_MOTION +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + rate2_nocoeff = rd_stats->rate; + last_motion_mode_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + mi); + base_mbmi = *mbmi; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int64_t best_rd = INT64_MAX; + for (motion_mode = SIMPLE_TRANSLATION; + motion_mode <= last_motion_mode_allowed; motion_mode++) { + int64_t tmp_rd = INT64_MAX; + int tmp_rate; + int64_t tmp_dist; +#if CONFIG_EXT_INTER + int tmp_rate2 = + motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff; +#else + int tmp_rate2 = rate2_nocoeff; +#endif // CONFIG_EXT_INTER + + *mbmi = base_mbmi; + mbmi->motion_mode = motion_mode; +#if CONFIG_MOTION_VAR + if (mbmi->motion_mode == OBMC_CAUSAL) { +#if CONFIG_EXT_INTER + *mbmi = *best_bmc_mbmi; + mbmi->motion_mode = OBMC_CAUSAL; +#endif // CONFIG_EXT_INTER + if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { + int tmp_rate_mv = 0; + + single_motion_search(cpi, x, bsize, mi_row, mi_col, +#if CONFIG_EXT_INTER + 0, +#endif // CONFIG_EXT_INTER + &tmp_rate_mv); + mbmi->mv[0].as_int = x->best_mv.as_int; + if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, + refs[0])) { + tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); + } +#if CONFIG_EXT_INTER + tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; +#else + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; +#endif // CONFIG_EXT_INTER +#if CONFIG_DUAL_FILTER + if (!has_subpel_mv_component(xd->mi[0], xd, 0)) + mbmi->interp_filter[0] = EIGHTTAP_REGULAR; + if (!has_subpel_mv_component(xd->mi[0], xd, 1)) + mbmi->interp_filter[1] = EIGHTTAP_REGULAR; +#endif // CONFIG_DUAL_FILTER + av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); +#if CONFIG_EXT_INTER + } else { + av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); +#endif // CONFIG_EXT_INTER + } + av1_build_obmc_inter_prediction( + cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, + args->left_pred_buf, args->left_pred_stride); + model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, + &tmp_dist, skip_txfm_sb, skip_sse_sb); + } +#endif // CONFIG_MOTION_VAR + +#if CONFIG_WARPED_MOTION + if (mbmi->motion_mode == WARPED_CAUSAL) { +#if CONFIG_EXT_INTER + *mbmi = *best_bmc_mbmi; + mbmi->motion_mode = WARPED_CAUSAL; +#endif // CONFIG_EXT_INTER + mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; +#if CONFIG_DUAL_FILTER + for (int dir = 0; dir < 4; ++dir) + mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#else + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR + : cm->interp_filter; +#endif // CONFIG_DUAL_FILTER + + if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params[0], mi_row, mi_col) == 0) { + av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, + &tmp_dist, skip_txfm_sb, skip_sse_sb); + } else { + continue; + } + } +#endif // CONFIG_WARPED_MOTION + x->skip = 0; + + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; + rd_stats->rate = tmp_rate2; + if (last_motion_mode_allowed > SIMPLE_TRANSLATION) { +#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + if (last_motion_mode_allowed == WARPED_CAUSAL) +#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode]; +#if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + else + rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode]; +#endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + } +#if CONFIG_WARPED_MOTION + if (mbmi->motion_mode == WARPED_CAUSAL) { + rd_stats->rate -= rs; + } +#endif // CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (!*skip_txfm_sb) { + int64_t rdcosty = INT64_MAX; + int is_cost_valid_uv = 0; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); + } else { + int idx, idy; + super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y->skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } +#else + /* clang-format off */ + super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); +/* clang-format on */ +#endif // CONFIG_VAR_TX + + if (rd_stats_y->rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (mbmi->motion_mode != SIMPLE_TRANSLATION) { + continue; + } else { +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + restore_dst_buf(xd, *orig_dst); + return INT64_MAX; +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); + rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); +/* clang-format off */ +#if CONFIG_VAR_TX + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); +#else + is_cost_valid_uv = + super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty); +#endif // CONFIG_VAR_TX + if (!is_cost_valid_uv) { +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + continue; +#else + restore_dst_buf(xd, *orig_dst); + return INT64_MAX; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); +#if CONFIG_RD_DEBUG + // record transform block coefficient cost + // TODO(angiebird): So far rd_debug tool only detects discrepancy of + // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi + // here because we already collect the coefficient cost. Move this part to + // other place when we need to compare non-coefficient cost. + mbmi->rd_stats = *rd_stats; +#endif // CONFIG_RD_DEBUG +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (rd_stats->skip) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + mbmi->skip = 0; + // here mbmi->skip temporarily plays a role as what this_skip2 does + } else if (!xd->lossless[mbmi->segment_id] && + (RDCOST(x->rdmult, x->rddiv, + rd_stats_y->rate + rd_stats_uv->rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + rd_stats->dist) >= + RDCOST(x->rdmult, x->rddiv, + av1_cost_bit(av1_get_skip_prob(cm, xd), 1), + rd_stats->sse))) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats->dist = rd_stats->sse; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + mbmi->skip = 1; + } else { + rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + mbmi->skip = 0; + } + *disable_skip = 0; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } else { + x->skip = 1; + *disable_skip = 1; + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1); + +// The cost of skip bit needs to be added. +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + mbmi->skip = 0; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + + rd_stats->dist = *skip_sse_sb; + rd_stats->sse = *skip_sse_sb; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->skip = 1; + } + +#if CONFIG_GLOBAL_MOTION + if (this_mode == ZEROMV +#if CONFIG_EXT_INTER + || this_mode == ZERO_ZEROMV +#endif // CONFIG_EXT_INTER + ) { + if (is_nontrans_global_motion(xd)) { + rd_stats->rate -= rs; +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; + mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#else + mbmi->interp_filter = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#endif // CONFIG_DUAL_FILTER + } + } +#endif // CONFIG_GLOBAL_MOTION + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); + if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) { + best_mbmi = *mbmi; + best_rd = tmp_rd; + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; +#if CONFIG_VAR_TX + for (int i = 0; i < MAX_MB_PLANE; ++i) + memcpy(best_blk_skip[i], x->blk_skip[i], + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); +#endif // CONFIG_VAR_TX + best_xskip = x->skip; + best_disable_skip = *disable_skip; + } + } + + if (best_rd == INT64_MAX) { + av1_invalid_rd_stats(rd_stats); + restore_dst_buf(xd, *orig_dst); + return INT64_MAX; + } + *mbmi = best_mbmi; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; +#if CONFIG_VAR_TX + for (int i = 0; i < MAX_MB_PLANE; ++i) + memcpy(x->blk_skip[i], best_blk_skip[i], + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); +#endif // CONFIG_VAR_TX + x->skip = best_xskip; + *disable_skip = best_disable_skip; +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + + restore_dst_buf(xd, *orig_dst); + return 0; +} + +static int64_t handle_inter_mode( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, + int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) { + const AV1_COMMON *cm = &cpi->common; + (void)cm; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int is_comp_pred = has_second_ref(mbmi); + const int this_mode = mbmi->mode; + int_mv *frame_mv = mode_mv[this_mode]; + int i; + int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + int_mv cur_mv[2]; + int rate_mv = 0; +#if CONFIG_EXT_INTER + int pred_exists = 1; + const int bw = block_size_wide[bsize]; + int_mv single_newmv[TOTAL_REFS_PER_FRAME]; +#if CONFIG_INTERINTRA + const unsigned int *const interintra_mode_cost = + cpi->interintra_mode_cost[size_group_lookup[bsize]]; +#endif // CONFIG_INTERINTRA + const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); +#if CONFIG_REF_MV + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); +#endif // CONFIG_REF_MV +#else + int_mv *const single_newmv = args->single_newmv; +#endif // CONFIG_EXT_INTER +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); +#else + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + uint8_t *tmp_buf; + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_EXT_INTER + int rate2_bmc_nocoeff; + MB_MODE_INFO best_bmc_mbmi; +#if CONFIG_MOTION_VAR + int rate_mv_bmc; +#endif // CONFIG_MOTION_VAR +#endif // CONFIG_EXT_INTER +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int64_t rd = INT64_MAX; + BUFFER_SET orig_dst, tmp_dst; + int rs = 0; + + int skip_txfm_sb = 0; + int64_t skip_sse_sb = INT64_MAX; + int16_t mode_ctx; + +#if CONFIG_EXT_INTER + *args->compmode_interintra_cost = 0; + mbmi->use_wedge_interintra = 0; + *args->compmode_interinter_cost = 0; + mbmi->interinter_compound_type = COMPOUND_AVERAGE; + + // is_comp_interintra_pred implies !is_comp_pred + assert(!is_comp_interintra_pred || (!is_comp_pred)); + // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type) + assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi)); +#endif // CONFIG_EXT_INTER + +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (is_comp_pred) + mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; + else +#endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, bsize, -1); +#else // CONFIG_REF_MV + mode_ctx = mbmi_ext->mode_context[refs[0]]; +#endif // CONFIG_REF_MV + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); + else +#endif // CONFIG_HIGHBITDEPTH + tmp_buf = tmp_buf_; + // Make sure that we didn't leave the plane destination buffers set + // to tmp_buf at the end of the last iteration + assert(xd->plane[0].dst.buf != tmp_buf); + +#if CONFIG_WARPED_MOTION + mbmi->num_proj_ref[0] = 0; + mbmi->num_proj_ref[1] = 0; +#endif // CONFIG_WARPED_MOTION + + if (is_comp_pred) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + if (have_newmv_in_inter_mode(this_mode)) { + const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col, + &rate_mv, single_newmv, args); + if (ret_val != 0) + return ret_val; + else + rd_stats->rate += rate_mv; + } + for (i = 0; i < is_comp_pred + 1; ++i) { + cur_mv[i] = frame_mv[refs[i]]; + // Clip "next_nearest" so that it does not extend to far out of image + if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + +#if CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (this_mode == NEAREST_NEARESTMV) +#else + if (this_mode == NEARESTMV && is_comp_pred) +#endif // CONFIG_EXT_INTER + { +#if !CONFIG_EXT_INTER + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); +#endif // !CONFIG_EXT_INTER + if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { + cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + + for (i = 0; i < 2; ++i) { + clamp_mv2(&cur_mv[i].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + } + } + +#if CONFIG_EXT_INTER + if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { + if (this_mode == NEAREST_NEWMV || this_mode == NEAREST_NEARMV) { + cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + + lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); + clamp_mv2(&cur_mv[0].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; + mbmi->mv[0].as_int = cur_mv[0].as_int; + } + + if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) { + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + + lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); + clamp_mv2(&cur_mv[1].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + } + + if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + int ref_mv_idx = mbmi->ref_mv_idx + 1; + if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARESTMV || + this_mode == NEAR_NEARMV) { + cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + + lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); + clamp_mv2(&cur_mv[0].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; + mbmi->mv[0].as_int = cur_mv[0].as_int; + } + + if (this_mode == NEW_NEARMV || this_mode == NEAREST_NEARMV || + this_mode == NEAR_NEARMV) { + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + + lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); + clamp_mv2(&cur_mv[1].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + } +#else + if (this_mode == NEARMV && is_comp_pred) { + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + int ref_mv_idx = mbmi->ref_mv_idx + 1; + cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + + for (i = 0; i < 2; ++i) { + clamp_mv2(&cur_mv[i].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + } + } +#endif // CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < MAX_MB_PLANE; i++) { + tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; + tmp_dst.stride[i] = MAX_SB_SIZE; + } + for (i = 0; i < MAX_MB_PLANE; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + // We don't include the cost of the second reference here, because there + // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other + // words if you present them in that order, the second one is always known + // if the first is known. + // + // Under some circumstances we discount the cost of new mv mode to encourage + // initiation of a motion field. + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, + refs[0])) { +#if CONFIG_EXT_INTER + rd_stats->rate += + AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx), + cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, + mode_ctx)); +#else + rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx), + cost_mv_ref(cpi, NEARESTMV, mode_ctx)); +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + } else { + rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx); + } + + if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd && +#if CONFIG_EXT_INTER + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV +#else + mbmi->mode != NEARESTMV +#endif // CONFIG_EXT_INTER + ) + return INT64_MAX; + + int64_t ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, + &rd, &rs, &skip_txfm_sb, &skip_sse_sb); + if (ret_val != 0) return ret_val; + +#if CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + best_bmc_mbmi = *mbmi; + rate2_bmc_nocoeff = rd_stats->rate; + if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs; +#if CONFIG_MOTION_VAR + rate_mv_bmc = rate_mv; +#endif // CONFIG_MOTION_VAR +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + + if (is_comp_pred) { + int rate_sum, rs2; + int64_t dist_sum; + int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; + INTERINTER_COMPOUND_DATA best_compound_data; + int_mv best_mv[2]; + int best_tmp_rate_mv = rate_mv; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + int compound_type_cost[COMPOUND_TYPES]; + uint8_t pred0[2 * MAX_SB_SQUARE]; + uint8_t pred1[2 * MAX_SB_SQUARE]; + uint8_t *preds0[1] = { pred0 }; + uint8_t *preds1[1] = { pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + int masked_compound_used = is_any_masked_compound_used(bsize); + COMPOUND_TYPE cur_type; + + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + memset(&best_compound_data, 0, sizeof(best_compound_data)); +#if CONFIG_COMPOUND_SEGMENT + uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE]; + best_compound_data.seg_mask = tmp_mask_buf; +#endif // CONFIG_COMPOUND_SEGMENT + av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], + av1_compound_type_tree); + + if (masked_compound_used) { + av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], + av1_compound_type_tree); + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides); + } + + for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { + if (!is_interinter_compound_used(cur_type, bsize)) break; + tmp_rate_mv = rate_mv; + best_rd_cur = INT64_MAX; + mbmi->interinter_compound_type = cur_type; + rs2 = av1_cost_literal(get_interinter_compound_type_bits( + bsize, mbmi->interinter_compound_type)) + + (masked_compound_used + ? compound_type_cost[mbmi->interinter_compound_type] + : 0); + + switch (cur_type) { + case COMPOUND_AVERAGE: + av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, + INT64_MAX); + if (rd != INT64_MAX) + best_rd_cur = + RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + best_rd_compound = best_rd_cur; + break; +#if CONFIG_WEDGE + case COMPOUND_WEDGE: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + case COMPOUND_SEG: + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh && + best_rd_compound / 3 < ref_best_rd) { + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst, + &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col); + } + break; +#endif // CONFIG_COMPOUND_SEGMENT + default: assert(0); return 0; + } + + if (best_rd_cur < best_rd_compound) { + best_rd_compound = best_rd_cur; +#if CONFIG_WEDGE + best_compound_data.wedge_index = mbmi->wedge_index; + best_compound_data.wedge_sign = mbmi->wedge_sign; +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + best_compound_data.mask_type = mbmi->mask_type; + memcpy(best_compound_data.seg_mask, xd->seg_mask, + 2 * MAX_SB_SQUARE * sizeof(uint8_t)); +#endif // CONFIG_COMPOUND_SEGMENT + best_compound_data.interinter_compound_type = + mbmi->interinter_compound_type; + if (have_newmv_in_inter_mode(this_mode)) { + if (use_masked_motion_search(cur_type)) { + best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } + } + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } +#if CONFIG_WEDGE + mbmi->wedge_index = best_compound_data.wedge_index; + mbmi->wedge_sign = best_compound_data.wedge_sign; +#endif // CONFIG_WEDGE +#if CONFIG_COMPOUND_SEGMENT + mbmi->mask_type = best_compound_data.mask_type; + memcpy(xd->seg_mask, best_compound_data.seg_mask, + 2 * MAX_SB_SQUARE * sizeof(uint8_t)); +#endif // CONFIG_COMPOUND_SEGMENT + mbmi->interinter_compound_type = + best_compound_data.interinter_compound_type; + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; + xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int; + if (use_masked_motion_search(mbmi->interinter_compound_type)) { + rd_stats->rate += best_tmp_rate_mv - rate_mv; + rate_mv = best_tmp_rate_mv; + } + } + + if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst); + return INT64_MAX; + } + + pred_exists = 0; + + *args->compmode_interinter_cost = + av1_cost_literal(get_interinter_compound_type_bits( + bsize, mbmi->interinter_compound_type)) + + (masked_compound_used + ? compound_type_cost[mbmi->interinter_compound_type] + : 0); + } + +#if CONFIG_INTERINTRA + if (is_comp_interintra_pred) { + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int j; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]); + uint8_t *intrapred; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + intrapred = CONVERT_TO_BYTEPTR(intrapred_); + else +#endif // CONFIG_HIGHBITDEPTH + intrapred = intrapred_; + + mbmi->ref_frame[1] = NONE_FRAME; + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; + xd->plane[j].dst.stride = bw; + } + av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + restore_dst_buf(xd, orig_dst); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + av1_subtract_plane(x, bsize, 0); + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum); + best_interintra_rd = rd; + + if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) { + // Don't need to call restore_dst_buf here + return INT64_MAX; + } +#if CONFIG_WEDGE + if (is_interintra_wedge_used(bsize)) { + int64_t best_interintra_rd_nowedge = INT64_MAX; + int64_t best_interintra_rd_wedge = INT64_MAX; + int_mv tmp_mv; + int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_nowedge = rd; + + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1); + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0); + // Refine motion vector. + if (have_newmv_in_inter_mode(this_mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + do_masked_motion_search(cpi, x, mask, bw, bsize, mi_row, mi_col, + &tmp_mv, &tmp_rate_mv, 0); + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb); + rd = RDCOST(x->rdmult, x->rddiv, + rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + if (rd < best_interintra_rd_wedge) { + best_interintra_rd_wedge = rd; + } else { + tmp_mv.as_int = cur_mv[0].as_int; + tmp_rate_mv = rate_mv; + } + } else { + tmp_mv.as_int = cur_mv[0].as_int; + tmp_rate_mv = rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + av1_subtract_plane(x, bsize, 0); + rd = + estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, x->rddiv, + rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + best_interintra_rd = best_interintra_rd_wedge; + mbmi->mv[0].as_int = tmp_mv.as_int; + rd_stats->rate += tmp_rate_mv - rate_mv; + rate_mv = tmp_rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + best_interintra_rd = best_interintra_rd_nowedge; + mbmi->mv[0].as_int = cur_mv[0].as_int; + } + } else { + mbmi->use_wedge_interintra = 0; + best_interintra_rd = best_interintra_rd_nowedge; + } + } +#endif // CONFIG_WEDGE + + pred_exists = 0; + *args->compmode_interintra_cost = + av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1); + *args->compmode_interintra_cost += + interintra_mode_cost[mbmi->interintra_mode]; + if (is_interintra_wedge_used(bsize)) { + *args->compmode_interintra_cost += av1_cost_bit( + cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra); + if (mbmi->use_wedge_interintra) { + *args->compmode_interintra_cost += + av1_cost_literal(get_interintra_wedge_bits(bsize)); + } + } + } else if (is_interintra_allowed(mbmi)) { + *args->compmode_interintra_cost = + av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0); + } +#endif // CONFIG_INTERINTRA + + if (pred_exists == 0) { + int tmp_rate; + int64_t tmp_dist; + av1_build_inter_predictors_sb(xd, mi_row, mi_col, &orig_dst, bsize); + model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, + &tmp_dist, &skip_txfm_sb, &skip_sse_sb); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + } +#endif // CONFIG_EXT_INTER + + if (!is_comp_pred) +#if CONFIG_DUAL_FILTER + args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0]; +#else + args->single_filter[this_mode][refs[0]] = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + +#if CONFIG_EXT_INTER + if (args->modelled_rd != NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], + args->modelled_rd[mode1][refs[1]]); + if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst); + return INT64_MAX; + } + } else if (!is_comp_interintra_pred) { + args->modelled_rd[this_mode][refs[0]] = rd; + } + } +#endif // CONFIG_EXT_INTER + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + restore_dst_buf(xd, orig_dst); + return INT64_MAX; + } + } + + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + disable_skip, mode_mv, mi_row, mi_col, args, + ref_best_rd, refs, rate_mv, +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_EXT_INTER + rate2_bmc_nocoeff, &best_bmc_mbmi, +#if CONFIG_MOTION_VAR + rate_mv_bmc, +#endif // CONFIG_MOTION_VAR +#endif // CONFIG_EXT_INTER +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); + if (ret_val != 0) return ret_val; + + return 0; // The rate-distortion cost will be re-calculated by caller. +} + +#if CONFIG_INTRABC +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX; + + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MODE_INFO *const mi = xd->mi[0]; + const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); + const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row / MAX_MIB_SIZE; + + int_mv dv_ref; + av1_find_ref_dv(&dv_ref, mi_row, mi_col); + + const MvLimits tmp_mv_limits = x->mv_limits; + + // TODO(aconverse@google.com): Handle same row DV. + x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; + x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; + x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; + x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h; + assert(x->mv_limits.col_min >= tmp_mv_limits.col_min); + assert(x->mv_limits.col_max <= tmp_mv_limits.col_max); + assert(x->mv_limits.row_min >= tmp_mv_limits.row_min); + assert(x->mv_limits.row_max <= tmp_mv_limits.row_max); + av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv); + + if (x->mv_limits.col_max < x->mv_limits.col_min || + x->mv_limits.row_max < x->mv_limits.row_min) { + x->mv_limits = tmp_mv_limits; + return INT64_MAX; + } + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL); + for (int i = 0; i < MAX_MB_PLANE; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; + } + + int step_param = cpi->mv_step_param; + MV mvp_full = dv_ref.as_mv; + mvp_full.col >>= 3; + mvp_full.row >>= 3; + int sadpb = x->sadperbit16; + int cost_list[5]; + int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, cond_cost_list(cpi, cost_list), + &dv_ref.as_mv, INT_MAX, 1); + + x->mv_limits = tmp_mv_limits; + if (bestsme == INT_MAX) return INT64_MAX; + mvp_full = x->best_mv.as_mv; + MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 }; + if (mv_check_bounds(&x->mv_limits, &dv)) return INT64_MAX; + if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) return INT64_MAX; + MB_MODE_INFO *mbmi = &mi->mbmi; + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdcost = *rd_cost; + int best_skip = x->skip; +#if CONFIG_PALETTE + memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); +#endif + mbmi->use_intrabc = 1; + mbmi->mode = DC_PRED; + mbmi->uv_mode = DC_PRED; + mbmi->mv[0].as_mv = dv; +#if CONFIG_DUAL_FILTER + for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; +#else + mbmi->interp_filter = BILINEAR; +#endif + mbmi->skip = 0; + x->skip = 0; + av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + + int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, x->mvcost, + MV_COST_WEIGHT); + const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0); + const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0); + const int rate_mode = + cpi->y_mode_costs[A][L][DC_PRED] + av1_cost_bit(INTRABC_PROB, 1); + + RD_STATS rd_stats, rd_stats_uv; + av1_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + av1_merge_rd_stats(&rd_stats, &rd_stats_uv); +#if CONFIG_RD_DEBUG + mbmi->rd_stats = rd_stats; +#endif + + const aom_prob skip_prob = av1_get_skip_prob(cm, xd); + + RD_STATS rdc_noskip; + av1_init_rd_stats(&rdc_noskip); + rdc_noskip.rate = + rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); + rdc_noskip.dist = rd_stats.dist; + rdc_noskip.rdcost = + RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist); + if (rdc_noskip.rdcost < best_rd) { + best_rd = rdc_noskip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_noskip; + } + + x->skip = 1; + mbmi->skip = 1; + RD_STATS rdc_skip; + av1_init_rd_stats(&rdc_skip); + rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); + rdc_skip.dist = rd_stats.sse; + rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist); + if (rdc_skip.rdcost < best_rd) { + best_rd = rdc_skip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_skip; + } + *mbmi = best_mbmi; + *rd_cost = best_rdcost; + x->skip = best_skip; + return best_rd; +} +#endif // CONFIG_INTRABC + +void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = xd->plane; + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int y_skip = 0, uv_skip = 0; + int64_t dist_y = 0, dist_uv = 0; + TX_SIZE max_uv_tx_size; + const int unify_bsize = CONFIG_CB4X4; + + ctx->skip = 0; + xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; +#if CONFIG_INTRABC + xd->mi[0]->mbmi.use_intrabc = 0; +#endif // CONFIG_INTRABC + + const int64_t intra_yrd = + (bsize >= BLOCK_8X8 || unify_bsize) + ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd) + : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, best_rd); + + if (intra_yrd < best_rd) { + max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size] + [pd[1].subsampling_x][pd[1].subsampling_y]; + +#if CONFIG_CB4X4 +#if !CONFIG_CHROMA_2X2 + max_uv_tx_size = AOMMAX(max_uv_tx_size, TX_4X4); +#endif // !CONFIG_CHROMA_2X2 + if (!x->skip_chroma_rd) + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, + &uv_skip, bsize, max_uv_tx_size); +#else + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, + &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size); +#endif // CONFIG_CB4X4 + + if (y_skip && uv_skip) { + rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + + av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_cost->dist = dist_y + dist_uv; + } else { + rd_cost->rate = + rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + rd_cost->dist = dist_y + dist_uv; + } + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + } else { + rd_cost->rate = INT_MAX; + } + +#if CONFIG_INTRABC + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { + ctx->skip = x->skip; // FIXME where is the proper place to set this?! + assert(rd_cost->rate != INT_MAX); + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + } +#endif + if (rd_cost->rate == INT_MAX) return; + + ctx->mic = *xd->mi[0]; + ctx->mbmi_ext = *x->mbmi_ext; +} + +// Do we have an internal image edge (e.g. formatting bars). +int av1_internal_image_edge(const AV1_COMP *cpi) { + return (cpi->oxcf.pass == 2) && + ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || + (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); +} + +// Checks to see if a super block is on a horizontal image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { + int top_edge = 0; + int bottom_edge = cpi->common.mi_rows; + int is_active_h_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + + bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2); + bottom_edge = AOMMAX(top_edge, bottom_edge); + } + + if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || + ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { + is_active_h_edge = 1; + } + return is_active_h_edge; +} + +// Checks to see if a super block is on a vertical image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { + int left_edge = 0; + int right_edge = cpi->common.mi_cols; + int is_active_v_edge = 0; + + // For two pass account for any formatting bars detected. + if (cpi->oxcf.pass == 2) { + const TWO_PASS *const twopass = &cpi->twopass; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + + right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +// Checks to see if a super block is at the edge of the active image. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { + return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) || + av1_active_v_edge(cpi, mi_col, cpi->common.mib_size); +} + +#if CONFIG_PALETTE +static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->sb_type; + int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + float *const data = x->palette_buffer->kmeans_data_buf; + float centroids[2 * PALETTE_MAX_SIZE]; + uint8_t *const color_map = xd->plane[1].color_index_map; + int r, c; +#if CONFIG_HIGHBITDEPTH + const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); + const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); +#endif // CONFIG_HIGHBITDEPTH + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + (void)cpi; + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { +#if CONFIG_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) { + data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; + } else { +#endif // CONFIG_HIGHBITDEPTH + data[(r * cols + c) * 2] = src_u[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + } + } + + for (r = 1; r < 3; ++r) { + for (c = 0; c < pmi->palette_size[1]; ++c) { + centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c]; + } + } + + av1_calc_indices(data, centroids, color_map, rows * cols, + pmi->palette_size[1], 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); +} +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA +static void pick_filter_intra_interframe( + const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra, + int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, + PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, +#if CONFIG_EXT_INTRA + int8_t *uv_angle_delta, +#endif // CONFIG_EXT_INTRA +#if CONFIG_PALETTE + PALETTE_MODE_INFO *pmi_uv, int palette_ctx, +#endif // CONFIG_PALETTE + int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd, + int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode, + int *best_mode_index, int *best_skip2, int *best_mode_skippable, +#if CONFIG_SUPERTX + int *returnrate_nocoef, +#endif // CONFIG_SUPERTX + int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +#if CONFIG_PALETTE + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; +#endif // CONFIG_PALETTE + int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i; + int dc_mode_index; + const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; + int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd; + int64_t distortion_uv, model_rd = INT64_MAX; + TX_SIZE uv_tx; + + for (i = 0; i < MAX_MODES; ++i) + if (av1_mode_order[i].mode == DC_PRED && + av1_mode_order[i].ref_frame[0] == INTRA_FRAME) + break; + dc_mode_index = i; + assert(i < MAX_MODES); + + // TODO(huisu): use skip_mask for further speedup. + (void)skip_mask; + mbmi->mode = DC_PRED; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y, + &skippable, bsize, intra_mode_cost[mbmi->mode], + &this_rd, &model_rd, 0)) { + return; + } + if (rate_y == INT_MAX) return; + + uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] + [xd->plane[1].subsampling_y]; + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], + &skip_uv[uv_tx], &mode_uv[uv_tx]); +#if CONFIG_PALETTE + if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi; +#endif // CONFIG_PALETTE + filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; +#if CONFIG_EXT_INTRA + uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; +#endif // CONFIG_EXT_INTRA + } + + rate_uv = rate_uv_tokenonly[uv_tx]; + distortion_uv = dist_uv[uv_tx]; + skippable = skippable && skip_uv[uv_tx]; + mbmi->uv_mode = mode_uv[uv_tx]; +#if CONFIG_PALETTE + if (cm->allow_screen_content_tools) { + pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } +#endif // CONFIG_PALETTE +#if CONFIG_EXT_INTRA + mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; +#endif // CONFIG_EXT_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; + if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { + mbmi->filter_intra_mode_info.filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; + } + + rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; +#if CONFIG_PALETTE + if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED && + bsize >= BLOCK_8X8) + rate2 += av1_cost_bit( + av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); +#endif // CONFIG_PALETTE + + if (!xd->lossless[mbmi->segment_id]) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); + } + + rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0], + mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); + rate2 += write_uniform_cost( + FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); +#if CONFIG_EXT_INTRA + if (av1_is_directional_mode(mbmi->uv_mode, bsize)) { + rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[1]); + } +#endif // CONFIG_EXT_INTRA + if (mbmi->mode == DC_PRED) { + rate2 += + av1_cost_bit(cpi->common.fc->filter_intra_probs[1], + mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) + rate2 += + write_uniform_cost(FILTER_INTRA_MODES, + mbmi->filter_intra_mode_info.filter_intra_mode[1]); + } + distortion2 = distortion_y + distortion_uv; + av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row, + mi_col); + + rate2 += ref_costs_single[INTRA_FRAME]; + + if (skippable) { + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + } else { + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < *best_intra_rd) { + *best_intra_rd = this_rd; + *best_intra_mode = mbmi->mode; + } + for (i = 0; i < REFERENCE_MODES; ++i) + best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); + + if (this_rd < *best_rd) { + *best_mode_index = dc_mode_index; + mbmi->mv[0].as_int = 0; + rd_cost->rate = rate2; +#if CONFIG_SUPERTX + if (x->skip) + *returnrate_nocoef = rate2; + else + *returnrate_nocoef = rate2 - rate_y - rate_uv; + *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable); + *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), + mbmi->ref_frame[0] != INTRA_FRAME); +#endif // CONFIG_SUPERTX + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + *best_rd = this_rd; + *best_mbmode = *mbmi; + *best_skip2 = 0; + *best_mode_skippable = skippable; + } +} +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_MOTION_VAR +static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int mi_row, + int mi_col, const uint8_t *above, + int above_stride, const uint8_t *left, + int left_stride); +#endif // CONFIG_MOTION_VAR + +void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *returnrate_nocoef, +#endif // CONFIG_SUPERTX + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const RD_OPT *const rd_opt = &cpi->rd; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +#if CONFIG_PALETTE + const int try_palette = + cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; +#endif // CONFIG_PALETTE + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct segmentation *const seg = &cm->seg; + PREDICTION_MODE this_mode; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mbmi->segment_id; + int comp_pred, i, k; + int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; + struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; + int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; +#if CONFIG_EXT_INTER + int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; + int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_INTER + static const int flag_list[TOTAL_REFS_PER_FRAME] = { + 0, + AOM_LAST_FLAG, +#if CONFIG_EXT_REFS + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, +#endif // CONFIG_EXT_REFS + AOM_GOLD_FLAG, +#if CONFIG_EXT_REFS + AOM_BWD_FLAG, +#endif // CONFIG_EXT_REFS + AOM_ALT_FLAG + }; + int64_t best_rd = best_rd_so_far; + int best_rate_y = INT_MAX, best_rate_uv = INT_MAX; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; + MB_MODE_INFO best_mbmode; +#if CONFIG_REF_MV + int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); +#endif // CONFIG_REF_MV + int best_mode_skippable = 0; + int midx, best_mode_index = -1; + unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; + aom_prob comp_mode_p; + int64_t best_intra_rd = INT64_MAX; + unsigned int best_pred_sse = UINT_MAX; + PREDICTION_MODE best_intra_mode = DC_PRED; + int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL]; + int64_t dist_uvs[TX_SIZES_ALL]; + int skip_uvs[TX_SIZES_ALL]; + PREDICTION_MODE mode_uv[TX_SIZES_ALL]; +#if CONFIG_PALETTE + PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; +#endif // CONFIG_PALETTE +#if CONFIG_EXT_INTRA + int8_t uv_angle_delta[TX_SIZES_ALL]; + int is_directional_mode, angle_stats_ready = 0; + uint8_t directional_mode_skip_mask[INTRA_MODES]; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + int8_t dc_skipped = 1; + FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL]; +#endif // CONFIG_FILTER_INTRA + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; + int best_skip2 = 0; + uint8_t ref_frame_skip_mask[2] = { 0 }; +#if CONFIG_EXT_INTER + uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; + MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; + int64_t best_single_inter_rd = INT64_MAX; +#else + uint16_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; +#endif // CONFIG_EXT_INTER + int mode_skip_start = sf->mode_skip_start + 1; + const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + int64_t mode_threshold[MAX_MODES]; + int *mode_map = tile_data->mode_map[bsize]; + const int mode_search_skip_flags = sf->mode_search_skip_flags; +#if CONFIG_PVQ + od_rollback_buffer pre_buf; +#endif // CONFIG_PVQ + + HandleInterModeArgs args = { +#if CONFIG_MOTION_VAR + { NULL }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, +#endif // CONFIG_MOTION_VAR +#if CONFIG_EXT_INTER + NULL, + NULL, + NULL, + NULL, + NULL, +#else // CONFIG_EXT_INTER + NULL, +#endif // CONFIG_EXT_INTER + { { 0 } }, + }; + +#if CONFIG_PALETTE || CONFIG_EXT_INTRA + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; +#endif // CONFIG_PALETTE || CONFIG_EXT_INTRA +#if CONFIG_PALETTE + int palette_ctx = 0; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; +#endif // CONFIG_PALETTE +#if CONFIG_MOTION_VAR +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); +#else + DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]); + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1); + args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + args.above_pred_buf[2] = + CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len); + args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2); + args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + args.left_pred_buf[2] = + CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len); + } else { +#endif // CONFIG_HIGHBITDEPTH + args.above_pred_buf[0] = tmp_buf1; + args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE; + args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE; + args.left_pred_buf[0] = tmp_buf2; + args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE; + args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_MOTION_VAR + + av1_zero(best_mbmode); + +#if CONFIG_PALETTE + av1_zero(pmi_uv); + if (try_palette) { + if (above_mi) + palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); + if (left_mi) + palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); + } +#endif // CONFIG_PALETTE + +#if CONFIG_EXT_INTRA + memset(directional_mode_skip_mask, 0, + sizeof(directional_mode_skip_mask[0]) * INTRA_MODES); +#endif // CONFIG_EXT_INTRA + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; + for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX; + for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; + for (i = 0; i < MB_MODE_COUNT; ++i) { + for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) { + args.single_filter[i][k] = SWITCHABLE; + } + } + + rd_cost->rate = INT_MAX; +#if CONFIG_SUPERTX + *returnrate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + x->mbmi_ext->mode_context[ref_frame] = 0; +#if CONFIG_REF_MV && CONFIG_EXT_INTER + x->mbmi_ext->compound_mode_context[ref_frame] = 0; +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; +#if CONFIG_GLOBAL_MOTION + frame_mv[ZEROMV][ref_frame].as_int = + gm_get_motion_vector(&cm->global_motion[ref_frame], + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + 0) + .as_int; +#else // CONFIG_GLOBAL_MOTION + frame_mv[ZEROMV][ref_frame].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_EXT_INTER + frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; +#if CONFIG_GLOBAL_MOTION + frame_mv[ZERO_ZEROMV][ref_frame].as_int = + gm_get_motion_vector(&cm->global_motion[ref_frame], + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + 0) + .as_int; +#else // CONFIG_GLOBAL_MOTION + frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION +#endif // CONFIG_EXT_INTER + } + +#if CONFIG_REF_MV + for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + MODE_INFO *const mi = xd->mi[0]; + int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + x->mbmi_ext->mode_context[ref_frame] = 0; + av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], + mbmi_ext->ref_mv_stack[ref_frame], +#if CONFIG_EXT_INTER + mbmi_ext->compound_mode_context, +#endif // CONFIG_EXT_INTER + candidates, mi_row, mi_col, NULL, NULL, + mbmi_ext->mode_context); + if (mbmi_ext->ref_mv_count[ref_frame] < 2) { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + if (mbmi_ext->ref_mvs[rf[0]][0].as_int != + frame_mv[ZEROMV][rf[0]].as_int || + mbmi_ext->ref_mvs[rf[0]][1].as_int != + frame_mv[ZEROMV][rf[0]].as_int || + mbmi_ext->ref_mvs[rf[1]][0].as_int != + frame_mv[ZEROMV][rf[1]].as_int || + mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int) + mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET); + } + } +#endif // CONFIG_REF_MV + +#if CONFIG_MOTION_VAR + av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, + args.above_pred_buf, dst_width1, + dst_height1, args.above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, + args.left_pred_buf, dst_width2, + dst_height2, args.left_pred_stride); + av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, + mi_col); + x->mask_buf = mask2d_buf; + x->wsrc_buf = weighted_src_buf; + calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0], + args.above_pred_stride[0], args.left_pred_buf[0], + args.left_pred_stride[0]); + } +#endif // CONFIG_MOTION_VAR + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { +// Skip checking missing references in both single and compound reference +// modes. Note that a mode will be skipped iff both reference frames +// are masked out. +#if CONFIG_EXT_REFS + if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01); + } else { +#endif // CONFIG_EXT_REFS + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; +#if CONFIG_EXT_REFS + } +#endif // CONFIG_EXT_REFS + } else { + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { + mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + break; + } + } + } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } + } + + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + int_mv zeromv; + ref_frame_skip_mask[0] = (1 << LAST_FRAME) | +#if CONFIG_EXT_REFS + (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | + (1 << BWDREF_FRAME) | +#endif // CONFIG_EXT_REFS + (1 << GOLDEN_FRAME); + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + // TODO(zoeliu): To further explore whether following needs to be done for + // BWDREF_FRAME as well. + mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; +#if CONFIG_GLOBAL_MOTION + zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME], + cm->allow_high_precision_mv, bsize, + mi_col, mi_row, 0) + .as_int; +#else + zeromv.as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); + if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); +#if CONFIG_EXT_INTER + if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); + if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV); + if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV); + if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); +#endif // CONFIG_EXT_INTER + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->alt_ref_search_fp) { + assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); + mode_skip_mask[ALTREF_FRAME] = 0; + ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); + ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; + } + } + + if (sf->alt_ref_search_fp) + if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) + if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) + mode_skip_mask[ALTREF_FRAME] |= INTER_ALL; + + if (sf->adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1)) + mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; + } + + if (bsize > sf->max_intra_bsize) { + ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); + ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); + } + + mode_skip_mask[INTRA_FRAME] |= + ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + + for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; + + midx = sf->schedule_mode_search ? mode_skip_start : 0; + while (midx > 4) { + uint8_t end_pos = 0; + for (i = 5; i < midx; ++i) { + if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { + uint8_t tmp = mode_map[i]; + mode_map[i] = mode_map[i - 1]; + mode_map[i - 1] = tmp; + end_pos = i; + } + } + midx = end_pos; + } + + if (cpi->sf.tx_type_search.fast_intra_tx_type_search) + x->use_default_intra_tx_type = 1; + else + x->use_default_intra_tx_type = 0; + + if (cpi->sf.tx_type_search.fast_inter_tx_type_search) + x->use_default_inter_tx_type = 1; + else + x->use_default_inter_tx_type = 0; +#if CONFIG_PVQ + od_encode_checkpoint(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ +#if CONFIG_EXT_INTER + for (i = 0; i < MB_MODE_COUNT; ++i) + for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) + modelled_rd[i][ref_frame] = INT64_MAX; +#endif // CONFIG_EXT_INTER + + for (midx = 0; midx < MAX_MODES; ++midx) { + int mode_index; + int mode_excluded = 0; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int compmode_cost = 0; +#if CONFIG_EXT_INTER + int compmode_interintra_cost = 0; + int compmode_interinter_cost = 0; +#endif // CONFIG_EXT_INTER + int rate2 = 0, rate_y = 0, rate_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int skippable = 0; + int this_skip2 = 0; + int64_t total_sse = INT64_MAX; +#if CONFIG_REF_MV + uint8_t ref_frame_type; +#endif // CONFIG_REF_MV +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + mode_index = mode_map[midx]; + this_mode = av1_mode_order[mode_index].mode; + ref_frame = av1_mode_order[mode_index].ref_frame[0]; + second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; +#if CONFIG_REF_MV + mbmi->ref_mv_idx = 0; +#endif // CONFIG_REF_MV + +#if CONFIG_EXT_INTER + if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) { + // Mode must by compatible + if (!is_interintra_allowed_mode(this_mode)) continue; + if (!is_interintra_allowed_bsize(bsize)) continue; + } + + if (is_inter_compound_mode(this_mode)) { + frame_mv[this_mode][ref_frame].as_int = + frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; + frame_mv[this_mode][second_ref_frame].as_int = + frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int; + } +#endif // CONFIG_EXT_INTER + + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (midx == mode_skip_start && best_mode_index >= 0) { + switch (best_mbmode.ref_frame[0]) { + case INTRA_FRAME: break; + case LAST_FRAME: + ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#if CONFIG_EXT_REFS + case LAST2_FRAME: + ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; + case LAST3_FRAME: + ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#endif // CONFIG_EXT_REFS + case GOLDEN_FRAME: + ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#if CONFIG_EXT_REFS + case BWDREF_FRAME: + ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#endif // CONFIG_EXT_REFS + case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK; +#if CONFIG_EXT_REFS + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; +#endif // CONFIG_EXT_REFS + break; + case NONE_FRAME: + case TOTAL_REFS_PER_FRAME: + assert(0 && "Invalid Reference frame"); + break; + } + } + + if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && + (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) + continue; + + if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; + + // Test best rd so far against threshold for trying this mode. + if (best_mode_skippable && sf->schedule_mode_search) + mode_threshold[mode_index] <<= 1; + + if (best_rd < mode_threshold[mode_index]) continue; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + +#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream +#if CONFIG_EXT_REFS + if (cpi->oxcf.pass == 0) { + // Complexity-compression trade-offs + // if (ref_frame == ALTREF_FRAME) continue; + // if (ref_frame == BWDREF_FRAME) continue; + if (second_ref_frame == ALTREF_FRAME) continue; + // if (second_ref_frame == BWDREF_FRAME) continue; + } +#endif +#endif + comp_pred = second_ref_frame > INTRA_FRAME; + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME) + continue; + + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + } else { + if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; + } + + if (ref_frame == INTRA_FRAME) { + if (sf->adaptive_mode_search) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse) + continue; + + if (this_mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + continue; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, best_intra_mode)) continue; + } + } +#if CONFIG_GLOBAL_MOTION + } else if (cm->global_motion[ref_frame].wmtype == IDENTITY && + (!comp_pred || + cm->global_motion[second_ref_frame].wmtype == IDENTITY)) { +#else // CONFIG_GLOBAL_MOTION + } else { +#endif // CONFIG_GLOBAL_MOTION + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; + if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, +#if CONFIG_REF_MV && CONFIG_EXT_INTER + mbmi_ext->compound_mode_context, +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + frame_mv, this_mode, ref_frames, bsize, -1, + mi_row, mi_col)) + continue; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; +#if CONFIG_PALETTE + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; +#endif // CONFIG_PALETTE +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + + set_default_interp_filters(mbmi, cm->interp_filter); + + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + +#if CONFIG_EXT_INTER + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); +#endif // CONFIG_EXT_INTER + + if (ref_frame == INTRA_FRAME) { + RD_STATS rd_stats_y; + TX_SIZE uv_tx; + struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_EXT_INTRA + is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); + if (is_directional_mode) { + int rate_dummy; + int64_t model_rd = INT64_MAX; + if (!angle_stats_ready) { + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + highbd_angle_estimation(src, src_stride, rows, cols, + directional_mode_skip_mask); + else +#endif // CONFIG_HIGHBITDEPTH + angle_estimation(src, src_stride, rows, cols, + directional_mode_skip_mask); + angle_stats_ready = 1; + } + if (directional_mode_skip_mask[mbmi->mode]) continue; + rd_stats_y.rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize, + intra_mode_cost[mbmi->mode], best_rd, + &model_rd); + } else { + mbmi->angle_delta[0] = 0; + super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); + } +#else + super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); +#endif // CONFIG_EXT_INTRA + rate_y = rd_stats_y.rate; + distortion_y = rd_stats_y.dist; + skippable = rd_stats_y.skip; + + if (rate_y == INT_MAX) continue; + +#if CONFIG_FILTER_INTRA + if (mbmi->mode == DC_PRED) dc_skipped = 0; +#endif // CONFIG_FILTER_INTRA + + uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x] + [pd->subsampling_y]; + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], + &skip_uvs[uv_tx], &mode_uv[uv_tx]); +#if CONFIG_PALETTE + if (try_palette) pmi_uv[uv_tx] = *pmi; +#endif // CONFIG_PALETTE + +#if CONFIG_EXT_INTRA + uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; +#endif // CONFIG_FILTER_INTRA + } + + rate_uv = rate_uv_tokenonly[uv_tx]; + distortion_uv = dist_uvs[uv_tx]; + skippable = skippable && skip_uvs[uv_tx]; + mbmi->uv_mode = mode_uv[uv_tx]; +#if CONFIG_PALETTE + if (try_palette) { + pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } +#endif // CONFIG_PALETTE + +#if CONFIG_EXT_INTRA + mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; + if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { + mbmi->filter_intra_mode_info.filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; + } +#endif // CONFIG_FILTER_INTRA + +#if CONFIG_CB4X4 + rate2 = rate_y + intra_mode_cost[mbmi->mode]; + if (!x->skip_chroma_rd) + rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; +#else + rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; +#endif // CONFIG_CB4X4 + +#if CONFIG_PALETTE + if (try_palette && mbmi->mode == DC_PRED) { + rate2 += av1_cost_bit( + av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); + } +#endif // CONFIG_PALETTE + + if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) { + // super_block_yrd above includes the cost of the tx_size in the + // tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); + } +#if CONFIG_EXT_INTRA + if (is_directional_mode) { +#if CONFIG_INTRA_INTERP + const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd); + const int p_angle = + mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; + if (av1_is_intra_filter_switchable(p_angle)) + rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; +#endif // CONFIG_INTRA_INTERP + rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } + if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) { + rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[1]); + } +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + if (mbmi->mode == DC_PRED) { + rate2 += + av1_cost_bit(cm->fc->filter_intra_probs[0], + mbmi->filter_intra_mode_info.use_filter_intra_mode[0]); + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) { + rate2 += write_uniform_cost( + FILTER_INTRA_MODES, + mbmi->filter_intra_mode_info.filter_intra_mode[0]); + } + } + if (mbmi->uv_mode == DC_PRED) { + rate2 += + av1_cost_bit(cpi->common.fc->filter_intra_probs[1], + mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); + if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) + rate2 += write_uniform_cost( + FILTER_INTRA_MODES, + mbmi->filter_intra_mode_info.filter_intra_mode[1]); + } +#endif // CONFIG_FILTER_INTRA + if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) + rate2 += intra_cost_penalty; + distortion2 = distortion_y + distortion_uv; + } else { +#if CONFIG_REF_MV + int_mv backup_ref_mv[2]; + +#if !SUB8X8_COMP_REF + if (bsize < BLOCK_8X8 && mbmi->ref_frame[1] > INTRA_FRAME) continue; +#endif // !SUB8X8_COMP_REF + + backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0]; + if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0]; +#endif // CONFIG_REF_MV +#if CONFIG_EXT_INTER + if (second_ref_frame == INTRA_FRAME) { + if (best_single_inter_ref != ref_frame) continue; + mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode]; +// TODO(debargha|geza.lore): +// Should we use ext_intra modes for interintra? +#if CONFIG_EXT_INTRA + mbmi->angle_delta[0] = 0; + mbmi->angle_delta[1] = 0; +#if CONFIG_INTRA_INTERP + mbmi->intra_filter = INTRA_FILTER_LINEAR; +#endif // CONFIG_INTRA_INTERP +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA + } +#endif // CONFIG_EXT_INTER +#if CONFIG_REF_MV + mbmi->ref_mv_idx = 0; + ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + +#if CONFIG_EXT_INTER + if (comp_pred) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + int ref_mv_idx = 0; + // Special case: NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1; + + if (compound_ref0_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } + if (compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; + } + } + } else { +#endif // CONFIG_EXT_INTER + if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + int ref; + for (ref = 0; ref < 1 + comp_pred; ++ref) { + int_mv this_mv = + (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv + : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; + } + } +#if CONFIG_EXT_INTER + } +#endif // CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + { + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + rd_stats.rate = rate2; + + // Point to variables that are maintained between loop iterations + args.single_newmv = single_newmv; +#if CONFIG_EXT_INTER + args.single_newmv_rate = single_newmv_rate; + args.compmode_interintra_cost = &compmode_interintra_cost; + args.compmode_interinter_cost = &compmode_interinter_cost; + args.modelled_rd = modelled_rd; +#endif // CONFIG_EXT_INTER + this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, + &rd_stats_uv, &disable_skip, frame_mv, + mi_row, mi_col, &args, best_rd); +// Prevent pointers from escaping local scope +#if CONFIG_EXT_INTER + args.compmode_interintra_cost = NULL; + args.compmode_interinter_cost = NULL; +#endif // CONFIG_EXT_INTER + + rate2 = rd_stats.rate; + skippable = rd_stats.skip; + distortion2 = rd_stats.dist; + total_sse = rd_stats.sse; + rate_y = rd_stats_y.rate; + rate_uv = rd_stats_uv.rate; + } + +#if CONFIG_REF_MV +// TODO(jingning): This needs some refactoring to improve code quality +// and reduce redundant steps. +#if CONFIG_EXT_INTER + if ((have_nearmv_in_inter_mode(mbmi->mode) && + mbmi_ext->ref_mv_count[ref_frame_type] > 2) || + ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { +#else + if ((mbmi->mode == NEARMV && + mbmi_ext->ref_mv_count[ref_frame_type] > 2) || + (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { +#endif + int_mv backup_mv = frame_mv[NEARMV][ref_frame]; + MB_MODE_INFO backup_mbmi = *mbmi; + int backup_skip = x->skip; + int64_t tmp_ref_rd = this_rd; + int ref_idx; + +// TODO(jingning): This should be deprecated shortly. +#if CONFIG_EXT_INTER + int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; +#else + int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0; +#endif // CONFIG_EXT_INTER + int ref_set = + AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); + + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset); + // Dummy + int_mv backup_fmv[2]; + backup_fmv[0] = frame_mv[NEWMV][ref_frame]; + if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame]; + + rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0); + + if (this_rd < INT64_MAX) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) + tmp_ref_rd = + RDCOST(x->rdmult, x->rddiv, + rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + distortion2); + else + tmp_ref_rd = + RDCOST(x->rdmult, x->rddiv, + rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + rate_y - rate_uv, + total_sse); + } +#if CONFIG_VAR_TX + for (i = 0; i < MAX_MB_PLANE; ++i) + memcpy(x->blk_skip_drl[i], x->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); +#endif // CONFIG_VAR_TX + + for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) { + int64_t tmp_alt_rd = INT64_MAX; + int dummy_disable_skip = 0; + int ref; + int_mv cur_mv; + RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; +#if CONFIG_EXT_INTER + int tmp_compmode_interintra_cost = 0; + int tmp_compmode_interinter_cost = 0; +#endif // CONFIG_EXT_INTER + + av1_invalid_rd_stats(&tmp_rd_stats); + x->skip = 0; + + mbmi->ref_mv_idx = 1 + ref_idx; + +#if CONFIG_EXT_INTER + if (comp_pred) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1 + mbmi->ref_mv_idx; + + if (compound_ref0_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } + + if (compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; + } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; + } + } else { +#endif // CONFIG_EXT_INTER + for (ref = 0; ref < 1 + comp_pred; ++ref) { + int_mv this_mv = + (ref == 0) + ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .this_mv + : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .comp_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; + } +#if CONFIG_EXT_INTER + } +#endif + + cur_mv = + mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset] + .this_mv; + clamp_mv2(&cur_mv.as_mv, xd); + + if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) { + int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; +#if CONFIG_EXT_INTER + int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; +#endif // CONFIG_EXT_INTER + + frame_mv[NEARMV][ref_frame] = cur_mv; + av1_init_rd_stats(&tmp_rd_stats); + + // Point to variables that are not maintained between iterations + args.single_newmv = dummy_single_newmv; +#if CONFIG_EXT_INTER + args.single_newmv_rate = dummy_single_newmv_rate; + args.compmode_interintra_cost = &tmp_compmode_interintra_cost; + args.compmode_interinter_cost = &tmp_compmode_interinter_cost; + args.modelled_rd = NULL; +#endif // CONFIG_EXT_INTER + tmp_alt_rd = handle_inter_mode( + cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, + &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd); + // Prevent pointers from escaping local scope + args.single_newmv = NULL; +#if CONFIG_EXT_INTER + args.single_newmv_rate = NULL; + args.compmode_interintra_cost = NULL; + args.compmode_interinter_cost = NULL; +#endif // CONFIG_EXT_INTER + } + + for (i = 0; i < mbmi->ref_mv_idx; ++i) { + uint8_t drl1_ctx = 0; + drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], + i + idx_offset); + tmp_rd_stats.rate += + (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] + : 0); + } + + if (mbmi_ext->ref_mv_count[ref_frame_type] > + mbmi->ref_mv_idx + idx_offset + 1 && + ref_idx < ref_set - 1) { + uint8_t drl1_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], + mbmi->ref_mv_idx + idx_offset); + tmp_rd_stats.rate += + (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0] + : 0); + } + + if (tmp_alt_rd < INT64_MAX) { +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate, + tmp_rd_stats.dist); +#else + if (RDCOST(x->rdmult, x->rddiv, + tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, + tmp_rd_stats.dist) < + RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse)) + tmp_alt_rd = + RDCOST(x->rdmult, x->rddiv, + tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + tmp_rd_stats.dist); + else + tmp_alt_rd = + RDCOST(x->rdmult, x->rddiv, + tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, + tmp_rd_stats.sse); +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + + if (tmp_ref_rd > tmp_alt_rd) { + rate2 = tmp_rd_stats.rate; + disable_skip = dummy_disable_skip; + distortion2 = tmp_rd_stats.dist; + skippable = tmp_rd_stats.skip; + rate_y = tmp_rd_stats_y.rate; + rate_uv = tmp_rd_stats_uv.rate; + total_sse = tmp_rd_stats.sse; + this_rd = tmp_alt_rd; + tmp_ref_rd = tmp_alt_rd; + backup_mbmi = *mbmi; + backup_skip = x->skip; +#if CONFIG_VAR_TX + for (i = 0; i < MAX_MB_PLANE; ++i) + memcpy(x->blk_skip_drl[i], x->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); +#endif // CONFIG_VAR_TX +#if CONFIG_EXT_INTER + compmode_interintra_cost = tmp_compmode_interintra_cost; + compmode_interinter_cost = tmp_compmode_interinter_cost; +#endif // CONFIG_EXT_INTER + } else { + *mbmi = backup_mbmi; + x->skip = backup_skip; + } + } + + frame_mv[NEARMV][ref_frame] = backup_mv; + frame_mv[NEWMV][ref_frame] = backup_fmv[0]; + if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1]; +#if CONFIG_VAR_TX + for (i = 0; i < MAX_MB_PLANE; ++i) + memcpy(x->blk_skip[i], x->blk_skip_drl[i], + sizeof(uint8_t) * ctx->num_4x4_blk); +#endif // CONFIG_VAR_TX + } + mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0]; + if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1]; +#endif // CONFIG_REF_MV + + if (this_rd == INT64_MAX) continue; + +#if SUB8X8_COMP_REF + compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); +#else + if (mbmi->sb_type >= BLOCK_8X8) + compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); +#endif // SUB8X8_COMP_REF + + if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; + } + +#if CONFIG_EXT_INTER + rate2 += compmode_interintra_cost; + if (cm->reference_mode != SINGLE_REFERENCE && comp_pred) +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (mbmi->motion_mode == SIMPLE_TRANSLATION) +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + rate2 += compmode_interinter_cost; +#endif // CONFIG_EXT_INTER + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + if (comp_pred) { + rate2 += ref_costs_comp[ref_frame]; +#if CONFIG_EXT_REFS + rate2 += ref_costs_comp[second_ref_frame]; +#endif // CONFIG_EXT_REFS + } else { + rate2 += ref_costs_single[ref_frame]; + } + +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (ref_frame == INTRA_FRAME) { +#else + if (!disable_skip) { +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + if (skippable) { + // Back out the coefficient coding costs + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + // Cost the skip mb case + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { +#if CONFIG_REF_MV + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0, + distortion2) < + RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) { +#else + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { +#endif // CONFIG_REF_MV + // Add in the cost of the no skip flag. + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } else { + // FIXME(rbultje) make this work for splitmv also + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + this_skip2 = 1; + rate_y = 0; + rate_uv = 0; + } + } else { + // Add in the cost of the no skip flag. + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } else { + this_skip2 = mbmi->skip; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + if (this_skip2) { + rate_y = 0; + rate_uv = 0; + } +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + } + + if (ref_frame == INTRA_FRAME) { + // Keep record of best intra rd + if (this_rd < best_intra_rd) { + best_intra_rd = this_rd; + best_intra_mode = mbmi->mode; + } +#if CONFIG_EXT_INTER + } else if (second_ref_frame == NONE_FRAME) { + if (this_rd < best_single_inter_rd) { + best_single_inter_rd = this_rd; + best_single_inter_ref = mbmi->ref_frame[0]; + } +#endif // CONFIG_EXT_INTER + } + + if (!disable_skip && ref_frame == INTRA_FRAME) { + for (i = 0; i < REFERENCE_MODES; ++i) + best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); + } + + // Did this mode help.. i.e. is it the new best mode + if (this_rd < best_rd || x->skip) { + if (!mode_excluded) { + // Note index of best mode so far + best_mode_index = mode_index; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mbmi->mv[0].as_int = 0; + } else { + best_pred_sse = x->pred_sse[ref_frame]; + } + + rd_cost->rate = rate2; +#if CONFIG_SUPERTX + if (x->skip) + *returnrate_nocoef = rate2; + else + *returnrate_nocoef = rate2 - rate_y - rate_uv; + *returnrate_nocoef -= av1_cost_bit( + av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2); + *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), + mbmi->ref_frame[0] != INTRA_FRAME); +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + MODE_INFO *const mi = xd->mi[0]; + const MOTION_MODE motion_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION + mi); + if (motion_allowed == WARPED_CAUSAL) + *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; + else if (motion_allowed == OBMC_CAUSAL) + *returnrate_nocoef -= + cpi->motion_mode_cost1[bsize][mbmi->motion_mode]; +#else + *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; +#endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#endif // CONFIG_SUPERTX + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + best_mbmode = *mbmi; + best_skip2 = this_skip2; + best_mode_skippable = skippable; + best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), + this_skip2 || skippable); + best_rate_uv = rate_uv; + +#if CONFIG_VAR_TX + for (i = 0; i < MAX_MB_PLANE; ++i) + memcpy(ctx->blk_skip[i], x->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); +#endif // CONFIG_VAR_TX + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (!comp_pred) { + if (single_rd < best_pred_rd[SINGLE_REFERENCE]) + best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else { + if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) + best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + } + + if (x->skip && !comp_pred) break; + } + + if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 && + ((sf->tx_type_search.fast_inter_tx_type_search == 1 && + is_inter_mode(best_mbmode.mode)) || + (sf->tx_type_search.fast_intra_tx_type_search == 1 && + !is_inter_mode(best_mbmode.mode)))) { + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + + *mbmi = best_mbmode; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + if (is_inter_mode(mbmi->mode)) { + av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); +#if CONFIG_MOTION_VAR + if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_obmc_inter_prediction( + cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride, + args.left_pred_buf, args.left_pred_stride); + } +#endif // CONFIG_MOTION_VAR + av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#endif // CONFIG_VAR_TX + } else { + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } + + if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } + + if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) > + RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist))) { +#if CONFIG_VAR_TX + int idx, idy; +#endif // CONFIG_VAR_TX + best_mbmode.tx_type = mbmi->tx_type; + best_mbmode.tx_size = mbmi->tx_size; +#if CONFIG_VAR_TX + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; + + for (i = 0; i < MAX_MB_PLANE; ++i) + memcpy(ctx->blk_skip[i], x->blk_skip[i], + sizeof(uint8_t) * ctx->num_4x4_blk); + + best_mbmode.min_tx_size = mbmi->min_tx_size; +#endif // CONFIG_VAR_TX + rd_cost->rate += + (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->rdcost = + RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + best_skip2 = skip_blk; + } + } + +#if CONFIG_PALETTE + // Only try palette mode when the best mode so far is an intra mode. + if (try_palette && !is_inter_mode(best_mbmode.mode)) { + int rate2 = 0; +#if CONFIG_SUPERTX + int best_rate_nocoef; +#endif // CONFIG_SUPERTX + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, + best_model_rd_palette = INT64_MAX; + int skippable = 0, rate_overhead_palette = 0; + RD_STATS rd_stats_y; + TX_SIZE uv_tx; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = best_mbmode; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + rate_overhead_palette = rd_pick_palette_intra_sby( + cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, &best_rd_palette, + &best_model_rd_palette, NULL, NULL, NULL, NULL); + if (pmi->palette_size[0] == 0) goto PALETTE_EXIT; + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd); + if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT; + uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] + [xd->plane[1].subsampling_y]; + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], + &skip_uvs[uv_tx], &mode_uv[uv_tx]); + pmi_uv[uv_tx] = *pmi; +#if CONFIG_EXT_INTRA + uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; +#endif // CONFIG_FILTER_INTRA + } + mbmi->uv_mode = mode_uv[uv_tx]; + pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } +#if CONFIG_EXT_INTRA + mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; +#endif // CONFIG_EXT_INTRA +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]; + if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) { + mbmi->filter_intra_mode_info.filter_intra_mode[1] = + filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1]; + } +#endif // CONFIG_FILTER_INTRA + skippable = rd_stats_y.skip && skip_uvs[uv_tx]; + distortion2 = rd_stats_y.dist + dist_uvs[uv_tx]; + rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx]; + rate2 += ref_costs_single[INTRA_FRAME]; + + if (skippable) { + rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); +#if CONFIG_SUPERTX + best_rate_nocoef = rate2; +#endif // CONFIG_SUPERTX + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + } else { +#if CONFIG_SUPERTX + best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]); +#endif // CONFIG_SUPERTX + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + if (this_rd < best_rd) { + best_mode_index = 3; + mbmi->mv[0].as_int = 0; + rd_cost->rate = rate2; +#if CONFIG_SUPERTX + *returnrate_nocoef = best_rate_nocoef; +#endif // CONFIG_SUPERTX + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + best_mbmode = *mbmi; + best_skip2 = 0; + best_mode_skippable = skippable; + } + } +PALETTE_EXIT: +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA + // TODO(huisu): filter-intra is turned off in lossless mode for now to + // avoid a unit test failure + if (!xd->lossless[mbmi->segment_id] && +#if CONFIG_PALETTE + pmi->palette_size[0] == 0 && +#endif // CONFIG_PALETTE + !dc_skipped && best_mode_index >= 0 && + best_intra_rd < (best_rd + (best_rd >> 3))) { + pick_filter_intra_interframe( + cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly, + dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv, +#if CONFIG_EXT_INTRA + uv_angle_delta, +#endif // CONFIG_EXT_INTRA +#if CONFIG_PALETTE + pmi_uv, palette_ctx, +#endif // CONFIG_PALETTE + 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode, + &best_mode_index, &best_skip2, &best_mode_skippable, +#if CONFIG_SUPERTX + returnrate_nocoef, +#endif // CONFIG_SUPERTX + best_pred_rd, &best_mbmode, rd_cost); + } +#endif // CONFIG_FILTER_INTRA + + // The inter modes' rate costs are not calculated precisely in some cases. + // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and + // ZEROMV. Here, checks are added for those cases, and the mode decisions + // are corrected. + if (best_mbmode.mode == NEWMV +#if CONFIG_EXT_INTER + || best_mbmode.mode == NEW_NEWMV +#endif // CONFIG_EXT_INTER + ) { + const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], + best_mbmode.ref_frame[1] }; + int comp_pred_mode = refs[1] > INTRA_FRAME; + int_mv zeromv[2]; +#if CONFIG_REF_MV + const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame); +#endif // CONFIG_REF_MV +#if CONFIG_GLOBAL_MOTION + zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], + cm->allow_high_precision_mv, bsize, + mi_col, mi_row, 0) + .as_int; + zeromv[1].as_int = comp_pred_mode + ? gm_get_motion_vector(&cm->global_motion[refs[1]], + cm->allow_high_precision_mv, + bsize, mi_col, mi_row, 0) + .as_int + : 0; +#else + zeromv[0].as_int = 0; + zeromv[1].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_REF_MV + if (!comp_pred_mode) { + int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) + ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) + : INT_MAX; + + for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { + int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; + if (cur_mv.as_int == best_mbmode.mv[0].as_int) { + best_mbmode.mode = NEARMV; + best_mbmode.ref_mv_idx = i; + } + } + + if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int) + best_mbmode.mode = NEARESTMV; + else if (best_mbmode.mv[0].as_int == zeromv[0].as_int) + best_mbmode.mode = ZEROMV; + } else { + int_mv nearestmv[2]; + int_mv nearmv[2]; + +#if CONFIG_EXT_INTER + if (mbmi_ext->ref_mv_count[rf_type] > 1) { + nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv; + nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv; + } else { + nearmv[0] = frame_mv[NEARMV][refs[0]]; + nearmv[1] = frame_mv[NEARMV][refs[1]]; + } +#else + int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) + ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) + : INT_MAX; + + for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { + nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; + nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; + + if (nearmv[0].as_int == best_mbmode.mv[0].as_int && + nearmv[1].as_int == best_mbmode.mv[1].as_int) { + best_mbmode.mode = NEARMV; + best_mbmode.ref_mv_idx = i; + } + } +#endif // CONFIG_EXT_INTER + if (mbmi_ext->ref_mv_count[rf_type] >= 1) { + nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv; + nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv; + } else { + nearestmv[0] = frame_mv[NEARESTMV][refs[0]]; + nearestmv[1] = frame_mv[NEARESTMV][refs[1]]; + } + + if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && + nearestmv[1].as_int == best_mbmode.mv[1].as_int) { +#if CONFIG_EXT_INTER + best_mbmode.mode = NEAREST_NEARESTMV; + } else { + int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) + ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) + : INT_MAX; + + for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { + nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; + nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; + + // Try switching to the NEAR_NEAREST type modes first + if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && + nearmv[1].as_int == best_mbmode.mv[1].as_int) { + best_mbmode.mode = NEAREST_NEARMV; + best_mbmode.ref_mv_idx = i; + } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int && + nearestmv[1].as_int == best_mbmode.mv[1].as_int) { + best_mbmode.mode = NEAR_NEARESTMV; + best_mbmode.ref_mv_idx = i; + } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int && + nearmv[1].as_int == best_mbmode.mv[1].as_int) { + best_mbmode.mode = NEAR_NEARMV; + best_mbmode.ref_mv_idx = i; + } + } + + if (best_mbmode.mode == NEW_NEWMV && + best_mbmode.mv[0].as_int == zeromv[0].as_int && + best_mbmode.mv[1].as_int == zeromv[1].as_int) + best_mbmode.mode = ZERO_ZEROMV; + } +#else + best_mbmode.mode = NEARESTMV; + } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && + best_mbmode.mv[1].as_int == zeromv[1].as_int) { + best_mbmode.mode = ZEROMV; + } +#endif // CONFIG_EXT_INTER + } +#else +#if CONFIG_EXT_INTER + if (!comp_pred_mode) { +#endif // CONFIG_EXT_INTER + if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && + frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) || + !comp_pred_mode)) + best_mbmode.mode = NEARESTMV; + else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && + frame_mv[NEARMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) || + !comp_pred_mode)) + best_mbmode.mode = NEARMV; + else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && + ((comp_pred_mode && + best_mbmode.mv[1].as_int == zeromv[1].as_int) || + !comp_pred_mode)) + best_mbmode.mode = ZEROMV; +#if CONFIG_EXT_INTER + } else { +#if CONFIG_GLOBAL_MOTION + zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], + cm->allow_high_precision_mv, + bsize, mi_col, mi_row, 0) + .as_int; + zeromv[1].as_int = comp_pred_mode + ? gm_get_motion_vector(&cm->global_motion[refs[1]], + cm->allow_high_precision_mv, + bsize, mi_col, mi_row, 0) + .as_int + : 0; +#else + zeromv[0].as_int = 0; + zeromv[1].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int == + best_mbmode.mv[0].as_int && + frame_mv[NEAREST_NEARESTMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) + best_mbmode.mode = NEAREST_NEARESTMV; + else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int == + best_mbmode.mv[0].as_int && + frame_mv[NEAREST_NEARMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) + best_mbmode.mode = NEAREST_NEARMV; + else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int == + best_mbmode.mv[0].as_int && + frame_mv[NEAR_NEARESTMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) + best_mbmode.mode = NEAR_NEARESTMV; + else if (frame_mv[NEAR_NEARMV][refs[0]].as_int == + best_mbmode.mv[0].as_int && + frame_mv[NEAR_NEARMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) + best_mbmode.mode = NEAR_NEARMV; + else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && + best_mbmode.mv[1].as_int == zeromv[1].as_int) + best_mbmode.mode = ZERO_ZEROMV; + } +#endif // CONFIG_EXT_INTER +#endif // CONFIG_REF_MV + } + +#if CONFIG_REF_MV + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (best_mbmode.ref_mv_idx != 0 && +#if CONFIG_EXT_INTER + !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(best_mbmode.mode))) { +#else + !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) { +#endif + best_mbmode.ref_mv_idx = 0; + } + + { + int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame); + int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type]; + if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { + int_mv zeromv[2]; +#if CONFIG_GLOBAL_MOTION + const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], + best_mbmode.ref_frame[1] }; + zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], + cm->allow_high_precision_mv, + bsize, mi_col, mi_row, 0) + .as_int; + zeromv[1].as_int = (refs[1] != NONE_FRAME) + ? gm_get_motion_vector(&cm->global_motion[refs[1]], + cm->allow_high_precision_mv, + bsize, mi_col, mi_row, 0) + .as_int + : 0; + lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv); + lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv); +#else + zeromv[0].as_int = zeromv[1].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + if (best_mbmode.ref_frame[0] > INTRA_FRAME && + best_mbmode.mv[0].as_int == zeromv[0].as_int && +#if CONFIG_EXT_INTER + (best_mbmode.ref_frame[1] <= INTRA_FRAME) +#else + (best_mbmode.ref_frame[1] == NONE_FRAME || + best_mbmode.mv[1].as_int == zeromv[1].as_int) +#endif // CONFIG_EXT_INTER + ) { + best_mbmode.mode = ZEROMV; + } + } + } +#endif // CONFIG_REF_MV + + if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + +#if CONFIG_DUAL_FILTER + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter[0]) || + !is_inter_block(&best_mbmode)); + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter[1]) || + !is_inter_block(&best_mbmode)); + if (best_mbmode.ref_frame[1] > INTRA_FRAME) { + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter[2]) || + !is_inter_block(&best_mbmode)); + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter[3]) || + !is_inter_block(&best_mbmode)); + } +#else + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); +#endif // CONFIG_DUAL_FILTER + + if (!cpi->rc.is_src_frame_alt_ref) + av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, + sf->adaptive_rd_thresh, bsize, best_mode_index); + + // macroblock modes + *mbmi = best_mbmode; + x->skip |= best_skip2; + +// Note: this section is needed since the mode may have been forced to +// ZEROMV by the all-zero mode handling of ref-mv. +#if CONFIG_GLOBAL_MOTION + if (mbmi->mode == ZEROMV +#if CONFIG_EXT_INTER + || mbmi->mode == ZERO_ZEROMV +#endif // CONFIG_EXT_INTER + ) { +#if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR + // Correct the motion mode for ZEROMV + const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( +#if SEPARATE_GLOBAL_MOTION + 0, xd->global_motion, +#endif // SEPARATE_GLOBAL_MOTION + xd->mi[0]); + if (mbmi->motion_mode > last_motion_mode_allowed) + mbmi->motion_mode = last_motion_mode_allowed; +#endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR + + // Correct the interpolation filter for ZEROMV + if (is_nontrans_global_motion(xd)) { +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; + mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#else + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR + : cm->interp_filter; +#endif // CONFIG_DUAL_FILTER + } + } +#endif // CONFIG_GLOBAL_MOTION + +#if CONFIG_REF_MV + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { + if (mbmi->mode != NEWMV) + mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int; + else + mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int; + } +#endif // CONFIG_REF_MV + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (best_pred_rd[i] == INT64_MAX) + best_pred_diff[i] = INT_MIN; + else + best_pred_diff[i] = best_rd - best_pred_rd[i]; + } + + x->skip |= best_mode_skippable; + + assert(best_mode_index >= 0); + + store_coding_context(x, ctx, best_mode_index, best_pred_diff, + best_mode_skippable); + +#if CONFIG_PALETTE + if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) { + restore_uv_color_map(cpi, x); + } +#endif // CONFIG_PALETTE +} + +void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, + TileDataEnc *tile_data, MACROBLOCK *x, + int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_pred_diff[REFERENCE_MODES]; + unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; + aom_prob comp_mode_p; + InterpFilter best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + (void)mi_row; + (void)mi_col; + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i) + x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + +#if CONFIG_PALETTE + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; +#endif // CONFIG_PALETTE + +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA + mbmi->mode = ZEROMV; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; +#if CONFIG_GLOBAL_MOTION + mbmi->mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], + cm->allow_high_precision_mv, bsize, mi_col, mi_row, + 0) + .as_int; +#else // CONFIG_GLOBAL_MOTION + mbmi->mv[0].as_int = 0; +#endif // CONFIG_GLOBAL_MOTION + mbmi->tx_size = max_txsize_lookup[bsize]; + x->skip = 1; + +#if CONFIG_REF_MV + mbmi->ref_mv_idx = 0; + mbmi->pred_mv[0].as_int = 0; +#endif // CONFIG_REF_MV + + mbmi->motion_mode = SIMPLE_TRANSLATION; +#if CONFIG_MOTION_VAR + av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); +#endif +#if CONFIG_WARPED_MOTION + if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); + } +#endif + + set_default_interp_filters(mbmi, cm->interp_filter); + + if (cm->interp_filter != SWITCHABLE) { + best_filter = cm->interp_filter; + } else { + best_filter = EIGHTTAP_REGULAR; + if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) && + x->source_variance >= cpi->sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { +#if CONFIG_DUAL_FILTER + int k; + for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i; +#else + mbmi->interp_filter = i; +#endif // CONFIG_DUAL_FILTER + rs = av1_get_switchable_rate(cpi, xd); + if (rs < best_rs) { + best_rs = rs; +#if CONFIG_DUAL_FILTER + best_filter = mbmi->interp_filter[0]; +#else + best_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + } + } + } + } +// Set the appropriate filter +#if CONFIG_DUAL_FILTER + for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter; +#else + mbmi->interp_filter = best_filter; +#endif // CONFIG_DUAL_FILTER + rate2 += av1_get_switchable_rate(cpi, xd); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += av1_cost_bit(comp_mode_p, comp_pred); + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + +#if CONFIG_DUAL_FILTER + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == mbmi->interp_filter[0])); +#else + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == mbmi->interp_filter)); +#endif // CONFIG_DUAL_FILTER + + av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, + cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + + av1_zero(best_pred_diff); + + store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0); +} + +void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi, + TileDataEnc *tile_data, struct macroblock *x, + int mi_row, int mi_col, + struct RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *returnrate_nocoef, +#endif // CONFIG_SUPERTX + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const RD_OPT *const rd_opt = &cpi->rd; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct segmentation *const seg = &cm->seg; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mbmi->segment_id; + int comp_pred, i; + int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; + struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; + static const int flag_list[TOTAL_REFS_PER_FRAME] = { + 0, + AOM_LAST_FLAG, +#if CONFIG_EXT_REFS + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, +#endif // CONFIG_EXT_REFS + AOM_GOLD_FLAG, +#if CONFIG_EXT_REFS + AOM_BWD_FLAG, +#endif // CONFIG_EXT_REFS + AOM_ALT_FLAG + }; + int64_t best_rd = best_rd_so_far; + int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; + MB_MODE_INFO best_mbmode; + int ref_index, best_ref_index = 0; + unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; + aom_prob comp_mode_p; +#if CONFIG_DUAL_FILTER + InterpFilter tmp_best_filter[4] = { 0 }; +#else + InterpFilter tmp_best_filter = SWITCHABLE; +#endif // CONFIG_DUAL_FILTER + int rate_uv_intra, rate_uv_tokenonly = INT_MAX; + int64_t dist_uv = INT64_MAX; + int skip_uv; + PREDICTION_MODE mode_uv = DC_PRED; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME]; + b_mode_info best_bmodes[4]; + int best_skip2 = 0; + int ref_frame_skip_mask[2] = { 0 }; + int internal_active_edge = + av1_active_edge_sb(cpi, mi_row, mi_col) && av1_internal_image_edge(cpi); +#if CONFIG_PVQ + od_rollback_buffer pre_buf; + + od_encode_checkpoint(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + +#if CONFIG_SUPERTX + best_rd_so_far = INT64_MAX; + best_rd = best_rd_so_far; + best_yrd = best_rd_so_far; +#endif // CONFIG_SUPERTX + av1_zero(best_mbmode); + +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA + mbmi->motion_mode = SIMPLE_TRANSLATION; +#if CONFIG_EXT_INTER + mbmi->interinter_compound_type = COMPOUND_AVERAGE; + mbmi->use_wedge_interintra = 0; +#endif // CONFIG_EXT_INTER +#if CONFIG_WARPED_MOTION + mbmi->num_proj_ref[0] = 0; + mbmi->num_proj_ref[1] = 0; +#endif // CONFIG_WARPED_MOTION + + for (i = 0; i < 4; i++) { + int j; + for (j = 0; j < TOTAL_REFS_PER_FRAME; j++) + seg_mvs[i][j].as_int = INVALID_MV; + } + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; + rate_uv_intra = INT_MAX; + + rd_cost->rate = INT_MAX; +#if CONFIG_SUPERTX + *returnrate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + x->mbmi_ext->mode_context[ref_frame] = 0; +#if CONFIG_REF_MV && CONFIG_EXT_INTER + x->mbmi_ext->compound_mode_context[ref_frame] = 0; +#endif // CONFIG_REF_MV && CONFIG_EXT_INTER + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + } else { + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; +#if CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER + frame_mv[ZEROMV][ref_frame].as_int = 0; + } + +#if CONFIG_PALETTE + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; +#endif // CONFIG_PALETTE + + for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { + int mode_excluded = 0; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int compmode_cost = 0; + int rate2 = 0, rate_y = 0, rate_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int skippable = 0; + int this_skip2 = 0; + int64_t total_sse = INT_MAX; + +#if CONFIG_PVQ + od_encode_rollback(&x->daala_enc, &pre_buf); +#endif // CONFIG_PVQ + + ref_frame = av1_ref_order[ref_index].ref_frame[0]; + second_ref_frame = av1_ref_order[ref_index].ref_frame[1]; + +#if CONFIG_REF_MV + mbmi->ref_mv_idx = 0; +#endif // CONFIG_REF_MV + + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) { + if (ref_index == 3) { + switch (best_mbmode.ref_frame[0]) { + case INTRA_FRAME: break; + case LAST_FRAME: + ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | +#if CONFIG_EXT_REFS + (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | + (1 << BWDREF_FRAME) | +#endif // CONFIG_EXT_REFS + (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#if CONFIG_EXT_REFS + case LAST2_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST3_FRAME) | + (1 << GOLDEN_FRAME) | + (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; + case LAST3_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) | + (1 << GOLDEN_FRAME) | + (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#endif // CONFIG_EXT_REFS + case GOLDEN_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | +#if CONFIG_EXT_REFS + (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | + (1 << BWDREF_FRAME) | +#endif // CONFIG_EXT_REFS + (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; +#if CONFIG_EXT_REFS + case BWDREF_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) | + (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | + (1 << ALTREF_FRAME); + ref_frame_skip_mask[1] |= (1 << ALTREF_FRAME) | 0x01; + break; +#endif // CONFIG_EXT_REFS + case ALTREF_FRAME: + ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | +#if CONFIG_EXT_REFS + (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | + (1 << BWDREF_FRAME) | +#endif // CONFIG_EXT_REFS + (1 << GOLDEN_FRAME); +#if CONFIG_EXT_REFS + ref_frame_skip_mask[1] |= (1 << BWDREF_FRAME) | 0x01; +#endif // CONFIG_EXT_REFS + break; + case NONE_FRAME: + case TOTAL_REFS_PER_FRAME: + assert(0 && "Invalid Reference frame"); + break; + } + } + } + + if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && + (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) + continue; + + // Test best rd so far against threshold for trying this mode. + if (!internal_active_edge && + rd_less_than_thresh(best_rd, + rd_opt->threshes[segment_id][bsize][ref_index], + tile_data->thresh_freq_fact[bsize][ref_index])) + continue; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + +#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream +#if CONFIG_EXT_REFS + if (cpi->oxcf.pass == 0) { + // Complexity-compression trade-offs + // if (ref_frame == ALTREF_FRAME) continue; + // if (ref_frame == BWDREF_FRAME) continue; + if (second_ref_frame == ALTREF_FRAME) continue; + // if (second_ref_frame == BWDREF_FRAME) continue; + } +#endif +#endif + comp_pred = second_ref_frame > INTRA_FRAME; + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; + + if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mbmode.ref_frame[0] == INTRA_FRAME) + continue; + } + + // TODO(jingning, jkoleszar): scaling reference frame not supported for + // sub8x8 blocks. + if (ref_frame > INTRA_FRAME && + av1_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + continue; + + if (second_ref_frame > INTRA_FRAME && + av1_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) + continue; + + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + continue; + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + continue; + } + + mbmi->tx_size = TX_4X4; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; +// Evaluate all sub-pel filters irrespective of whether we can use +// them for this frame. +#if CONFIG_DUAL_FILTER + for (i = 0; i < 4; ++i) + mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#else + mbmi->interp_filter = + cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter; +#endif // CONFIG_DUAL_FILTER + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + +#if CONFIG_VAR_TX + mbmi->inter_tx_size[0][0] = mbmi->tx_size; + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif // CONFIG_VAR_TX + + if (ref_frame == INTRA_FRAME) { + int rate; + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, + NULL, best_rd) >= best_rd) + continue; + rate2 += rate; + rate2 += intra_cost_penalty; + distortion2 += distortion_y; + + if (rate_uv_intra == INT_MAX) { + choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra, + &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv); + } + rate2 += rate_uv_intra; + rate_uv = rate_uv_tokenonly; + distortion2 += dist_uv; + distortion_uv = dist_uv; + mbmi->uv_mode = mode_uv; + } else { + int rate; + int64_t distortion; + int64_t this_rd_thresh; + int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; + int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; + int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; + int tmp_best_skippable = 0; + int switchable_filter_index; + int_mv *second_ref = + comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL; + b_mode_info tmp_best_bmodes[16]; // Should this be 4 ? + MB_MODE_INFO tmp_best_mbmode; +#if CONFIG_DUAL_FILTER + BEST_SEG_INFO bsi[DUAL_FILTER_SET_SIZE]; +#else + BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; +#endif // CONFIG_DUAL_FILTER + int pred_exists = 0; + int uv_skippable; +#if CONFIG_EXT_INTER + int_mv compound_seg_newmvs[4][2]; + for (i = 0; i < 4; i++) { + compound_seg_newmvs[i][0].as_int = INVALID_MV; + compound_seg_newmvs[i][1].as_int = INVALID_MV; + } +#endif // CONFIG_EXT_INTER + + this_rd_thresh = (ref_frame == LAST_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_LAST] + : rd_opt->threshes[segment_id][bsize][THR_ALTR]; +#if CONFIG_EXT_REFS + this_rd_thresh = (ref_frame == LAST2_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_LAST2] + : this_rd_thresh; + this_rd_thresh = (ref_frame == LAST3_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_LAST3] + : this_rd_thresh; + this_rd_thresh = (ref_frame == BWDREF_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_BWDR] + : this_rd_thresh; +#endif // CONFIG_EXT_REFS + this_rd_thresh = (ref_frame == GOLDEN_FRAME) + ? rd_opt->threshes[segment_id][bsize][THR_GOLD] + : this_rd_thresh; + + // TODO(any): Add search of the tx_type to improve rd performance at the + // expense of speed. + mbmi->tx_type = DCT_DCT; + + if (cm->interp_filter != BILINEAR) { +#if CONFIG_DUAL_FILTER + tmp_best_filter[0] = EIGHTTAP_REGULAR; + tmp_best_filter[1] = EIGHTTAP_REGULAR; + tmp_best_filter[2] = EIGHTTAP_REGULAR; + tmp_best_filter[3] = EIGHTTAP_REGULAR; +#else + tmp_best_filter = EIGHTTAP_REGULAR; +#endif // CONFIG_DUAL_FILTER + if (x->source_variance < sf->disable_filter_search_var_thresh) { +#if CONFIG_DUAL_FILTER + tmp_best_filter[0] = EIGHTTAP_REGULAR; +#else + tmp_best_filter = EIGHTTAP_REGULAR; +#endif // CONFIG_DUAL_FILTER + } else if (sf->adaptive_pred_interp_filter == 1 && + ctx->pred_interp_filter < SWITCHABLE) { +#if CONFIG_DUAL_FILTER + tmp_best_filter[0] = ctx->pred_interp_filter; +#else + tmp_best_filter = ctx->pred_interp_filter; +#endif // CONFIG_DUAL_FILTER + } else if (sf->adaptive_pred_interp_filter == 2) { +#if CONFIG_DUAL_FILTER + tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE + ? ctx->pred_interp_filter + : 0; +#else + tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE + ? ctx->pred_interp_filter + : 0; +#endif // CONFIG_DUAL_FILTER + } else { +#if CONFIG_DUAL_FILTER + const int filter_set_size = DUAL_FILTER_SET_SIZE; +#else + const int filter_set_size = SWITCHABLE_FILTERS; +#endif // CONFIG_DUAL_FILTER + for (switchable_filter_index = 0; + switchable_filter_index < filter_set_size; + ++switchable_filter_index) { + int newbest, rs; + int64_t rs_rd; + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0]; + mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1]; + mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0]; + mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1]; +#else + mbmi->interp_filter = switchable_filter_index; +#endif // CONFIG_DUAL_FILTER + tmp_rd = rd_pick_inter_best_sub8x8_mode( + cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, + &rate, &rate_y, &distortion, &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, +#if CONFIG_EXT_INTER + compound_seg_newmvs, +#endif // CONFIG_EXT_INTER + bsi, switchable_filter_index, mi_row, mi_col); + if (tmp_rd == INT64_MAX) continue; + rs = av1_get_switchable_rate(cpi, xd); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd; + + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { +#if CONFIG_DUAL_FILTER + tmp_best_filter[0] = mbmi->interp_filter[0]; + tmp_best_filter[1] = mbmi->interp_filter[1]; + tmp_best_filter[2] = mbmi->interp_filter[2]; + tmp_best_filter[3] = mbmi->interp_filter[3]; +#else + tmp_best_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + tmp_best_rd = tmp_rd; + } + if ((newbest && cm->interp_filter == SWITCHABLE) || + ( +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] == cm->interp_filter +#else + mbmi->interp_filter == cm->interp_filter +#endif // CONFIG_DUAL_FILTER + && cm->interp_filter != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + tmp_best_skippable = skippable; + tmp_best_mbmode = *mbmi; + for (i = 0; i < 4; i++) { + tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; + } + pred_exists = 1; + } + } // switchable_filter_index loop + } + } + + if (tmp_best_rdu == INT64_MAX && pred_exists) continue; + +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = + (cm->interp_filter == SWITCHABLE ? tmp_best_filter[0] + : cm->interp_filter); + mbmi->interp_filter[1] = + (cm->interp_filter == SWITCHABLE ? tmp_best_filter[1] + : cm->interp_filter); + mbmi->interp_filter[2] = + (cm->interp_filter == SWITCHABLE ? tmp_best_filter[2] + : cm->interp_filter); + mbmi->interp_filter[3] = + (cm->interp_filter == SWITCHABLE ? tmp_best_filter[3] + : cm->interp_filter); +#else + mbmi->interp_filter = + (cm->interp_filter == SWITCHABLE ? tmp_best_filter + : cm->interp_filter); +#endif // CONFIG_DUAL_FILTER + + if (!pred_exists) { + // Handles the special case when a filter that is not in the + // switchable list (bilinear) is indicated at the frame level + tmp_rd = rd_pick_inter_best_sub8x8_mode( + cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, + &rate, &rate_y, &distortion, &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, +#if CONFIG_EXT_INTER + compound_seg_newmvs, +#endif // CONFIG_EXT_INTER + bsi, 0, mi_row, mi_col); + if (tmp_rd == INT64_MAX) continue; + } else { + total_sse = tmp_best_sse; + rate = tmp_best_rate; + rate_y = tmp_best_ratey; + distortion = tmp_best_distortion; + skippable = tmp_best_skippable; + *mbmi = tmp_best_mbmode; + for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i]; + } + // Add in the cost of the transform type + if (!xd->lossless[mbmi->segment_id]) { + int rate_tx_type = 0; +#if CONFIG_EXT_TX + if (get_ext_tx_types(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used) > + 1) { + const int eset = + get_ext_tx_set(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used); + rate_tx_type = + cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type]; + } +#else + if (mbmi->tx_size < TX_32X32) { + rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type]; + } +#endif // CONFIG_EXT_TX + rate += rate_tx_type; + rate_y += rate_tx_type; + } + + rate2 += rate; + distortion2 += distortion; + + if (cm->interp_filter == SWITCHABLE) + rate2 += av1_get_switchable_rate(cpi, xd); + + if (!mode_excluded) + mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE + : cm->reference_mode == COMPOUND_REFERENCE; + + compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); + + tmp_best_rdu = + best_rd - AOMMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), + RDCOST(x->rdmult, x->rddiv, 0, total_sse)); + + if (tmp_best_rdu > 0) { + // If even the 'Y' rd value of split is higher than best so far + // then dont bother looking at UV + int is_cost_valid_uv; + RD_STATS rd_stats_uv; + av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, NULL, + BLOCK_8X8); +#if CONFIG_VAR_TX + is_cost_valid_uv = + inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu); +#else + is_cost_valid_uv = + super_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu); +#endif // CONFIG_VAR_TX + rate_uv = rd_stats_uv.rate; + distortion_uv = rd_stats_uv.dist; + uv_skippable = rd_stats_uv.skip; + uv_sse = rd_stats_uv.sse; + + if (!is_cost_valid_uv) continue; + rate2 += rate_uv; + distortion2 += distortion_uv; + skippable = skippable && uv_skippable; + total_sse += uv_sse; + } else { + continue; + } + } + + if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + if (second_ref_frame > INTRA_FRAME) { + rate2 += ref_costs_comp[ref_frame]; +#if CONFIG_EXT_REFS + rate2 += ref_costs_comp[second_ref_frame]; +#endif // CONFIG_EXT_REFS + } else { + rate2 += ref_costs_single[ref_frame]; + } + + if (!disable_skip) { + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. + + if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { + // Add in the cost of the no skip flag. + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } else { + // FIXME(rbultje) make this work for splitmv also + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + this_skip2 = 1; + } + } else { + // Add in the cost of the no skip flag. + rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + } + + if (!disable_skip && ref_frame == INTRA_FRAME) { + for (i = 0; i < REFERENCE_MODES; ++i) + best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); + } + + // Did this mode help.. i.e. is it the new best mode + if (this_rd < best_rd || x->skip) { + if (!mode_excluded) { + // Note index of best mode so far + best_ref_index = ref_index; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mbmi->mv[0].as_int = 0; + } + + rd_cost->rate = rate2; +#if CONFIG_SUPERTX + *returnrate_nocoef = rate2 - rate_y - rate_uv; + if (!disable_skip) + *returnrate_nocoef -= + av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2); + *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), + mbmi->ref_frame[0] != INTRA_FRAME); + assert(*returnrate_nocoef > 0); +#endif // CONFIG_SUPERTX + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + best_yrd = + best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); + best_mbmode = *mbmi; + best_skip2 = this_skip2; + +#if CONFIG_VAR_TX + for (i = 0; i < MAX_MB_PLANE; ++i) + memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk); +#endif // CONFIG_VAR_TX + + for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) + best_pred_rd[SINGLE_REFERENCE] = single_rd; + else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) + best_pred_rd[COMPOUND_REFERENCE] = single_rd; + + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + } + + if (x->skip && !comp_pred) break; + } + + if (best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; +#if CONFIG_SUPERTX + *returnrate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + return; + } + + if (best_rd == INT64_MAX) { + rd_cost->rate = INT_MAX; + rd_cost->dist = INT64_MAX; + rd_cost->rdcost = INT64_MAX; +#if CONFIG_SUPERTX + *returnrate_nocoef = INT_MAX; +#endif // CONFIG_SUPERTX + return; + } + +#if CONFIG_DUAL_FILTER + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter[0]) || + !is_inter_block(&best_mbmode)); +#else + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); +#endif // CONFIG_DUAL_FILTER + + av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, + sf->adaptive_rd_thresh, bsize, best_ref_index); + + // macroblock modes + *mbmi = best_mbmode; +#if CONFIG_VAR_TX + mbmi->inter_tx_size[0][0] = mbmi->tx_size; +#endif // CONFIG_VAR_TX + + x->skip |= best_skip2; + if (!is_inter_block(&best_mbmode)) { + for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode; + } else { + for (i = 0; i < 4; ++i) + memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); + +#if CONFIG_REF_MV + mbmi->pred_mv[0].as_int = xd->mi[0]->bmi[3].pred_mv[0].as_int; + mbmi->pred_mv[1].as_int = xd->mi[0]->bmi[3].pred_mv[1].as_int; +#endif // CONFIG_REF_MV + mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; + } + +// Note: this section is needed since the mode may have been forced to ZEROMV +#if CONFIG_GLOBAL_MOTION + if (mbmi->mode == ZEROMV +#if CONFIG_EXT_INTER + || mbmi->mode == ZERO_ZEROMV +#endif // CONFIG_EXT_INTER + ) { + if (is_nontrans_global_motion(xd)) { +#if CONFIG_DUAL_FILTER + mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; + mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE + ? EIGHTTAP_REGULAR + : cm->interp_filter; +#else + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR + : cm->interp_filter; +#endif // CONFIG_DUAL_FILTER + } + } +#endif // CONFIG_GLOBAL_MOTION + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (best_pred_rd[i] == INT64_MAX) + best_pred_diff[i] = INT_MIN; + else + best_pred_diff[i] = best_rd - best_pred_rd[i]; + } + + store_coding_context(x, ctx, best_ref_index, best_pred_diff, 0); +} + +#if CONFIG_MOTION_VAR +// This function has a structure similar to av1_build_obmc_inter_prediction +// +// The OBMC predictor is computed as: +// +// PObmc(x,y) = +// AOM_BLEND_A64(Mh(x), +// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)), +// PLeft(x, y)) +// +// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate +// rounding, this can be written as: +// +// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) = +// Mh(x) * Mv(y) * P(x,y) + +// Mh(x) * Cv(y) * Pabove(x,y) + +// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) +// +// Where : +// +// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y) +// Ch(y) = AOM_BLEND_A64_MAX_ALPHA - Mh(y) +// +// This function computes 'wsrc' and 'mask' as: +// +// wsrc(x, y) = +// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) - +// Mh(x) * Cv(y) * Pabove(x,y) + +// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) +// +// mask(x, y) = Mh(x) * Mv(y) +// +// These can then be used to efficiently approximate the error for any +// predictor P in the context of the provided neighbouring predictors by +// computing: +// +// error(x, y) = +// wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2) +// +static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, + const MACROBLOCKD *xd, int mi_row, + int mi_col, const uint8_t *above, + int above_stride, const uint8_t *left, + int left_stride) { + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + int row, col, i; + const int bw = xd->n8_w << MI_SIZE_LOG2; + const int bh = xd->n8_h << MI_SIZE_LOG2; + int32_t *mask_buf = x->mask_buf; + int32_t *wsrc_buf = x->wsrc_buf; + const int wsrc_stride = bw; + const int mask_stride = bw; + const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; +#if CONFIG_HIGHBITDEPTH + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; +#else + const int is_hbd = 0; +#endif // CONFIG_HIGHBITDEPTH + + // plane 0 should not be subsampled + assert(xd->plane[0].subsampling_x == 0); + assert(xd->plane[0].subsampling_y == 0); + + av1_zero_array(wsrc_buf, bw * bh); + for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA; + + // handle above row + if (xd->up_available) { + const int overlap = num_4x4_blocks_high_lookup[bsize] * 2; + const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); + const int mi_row_offset = -1; + const uint8_t *const mask1d = av1_get_obmc_mask(overlap); + const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]]; + int neighbor_count = 0; + + assert(miw > 0); + + i = 0; + do { // for each mi in the above row + const int mi_col_offset = i; + const MB_MODE_INFO *const above_mbmi = + &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; + const BLOCK_SIZE a_bsize = above_mbmi->sb_type; + const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]); + const int neighbor_bw = mi_step * MI_SIZE; + + if (is_neighbor_overlappable(above_mbmi)) { + if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8)) + neighbor_count += 2; + else + neighbor_count++; + if (neighbor_count > neighbor_limit) break; + + const int tmp_stride = above_stride; + int32_t *wsrc = wsrc_buf + (i * MI_SIZE); + int32_t *mask = mask_buf + (i * MI_SIZE); + + if (!is_hbd) { + const uint8_t *tmp = above; + + for (row = 0; row < overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (col = 0; col < neighbor_bw; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += wsrc_stride; + mask += mask_stride; + tmp += tmp_stride; + } +#if CONFIG_HIGHBITDEPTH + } else { + const uint16_t *tmp = CONVERT_TO_SHORTPTR(above); + + for (row = 0; row < overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (col = 0; col < neighbor_bw; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += wsrc_stride; + mask += mask_stride; + tmp += tmp_stride; + } +#endif // CONFIG_HIGHBITDEPTH + } + } + + above += neighbor_bw; + i += mi_step; + } while (i < miw); + } + + for (i = 0; i < bw * bh; ++i) { + wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; + mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; + } + + // handle left column + if (xd->left_available) { + const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2; + const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); + const int mi_col_offset = -1; + const uint8_t *const mask1d = av1_get_obmc_mask(overlap); + const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]]; + int neighbor_count = 0; + + assert(mih > 0); + + i = 0; + do { // for each mi in the left column + const int mi_row_offset = i; + const MB_MODE_INFO *const left_mbmi = + &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; + const BLOCK_SIZE l_bsize = left_mbmi->sb_type; + const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]); + const int neighbor_bh = mi_step * MI_SIZE; + + if (is_neighbor_overlappable(left_mbmi)) { + if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4)) + neighbor_count += 2; + else + neighbor_count++; + if (neighbor_count > neighbor_limit) break; + + const int tmp_stride = left_stride; + int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride); + int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride); + + if (!is_hbd) { + const uint8_t *tmp = left; + + for (row = 0; row < neighbor_bh; ++row) { + for (col = 0; col < overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += wsrc_stride; + mask += mask_stride; + tmp += tmp_stride; + } +#if CONFIG_HIGHBITDEPTH + } else { + const uint16_t *tmp = CONVERT_TO_SHORTPTR(left); + + for (row = 0; row < neighbor_bh; ++row) { + for (col = 0; col < overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += wsrc_stride; + mask += mask_stride; + tmp += tmp_stride; + } +#endif // CONFIG_HIGHBITDEPTH + } + } + + left += neighbor_bh * left_stride; + i += mi_step; + } while (i < mih); + } + + if (!is_hbd) { + const uint8_t *src = x->plane[0].src.buf; + + for (row = 0; row < bh; ++row) { + for (col = 0; col < bw; ++col) { + wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; + } + wsrc_buf += wsrc_stride; + src += x->plane[0].src.stride; + } +#if CONFIG_HIGHBITDEPTH + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); + + for (row = 0; row < bh; ++row) { + for (col = 0; col < bw; ++col) { + wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; + } + wsrc_buf += wsrc_stride; + src += x->plane[0].src.stride; + } +#endif // CONFIG_HIGHBITDEPTH + } +} + +#if CONFIG_NCOBMC +void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MB_MODE_INFO backup_mbmi; + BLOCK_SIZE bsize = mbmi->sb_type; + int ref, skip_blk, backup_skip = x->skip; + int64_t rd_causal; + RD_STATS rd_stats_y, rd_stats_uv; + int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + + // Recompute the best causal predictor and rd + mbmi->motion_mode = SIMPLE_TRANSLATION; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); + assert(cfg != NULL); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf); + } + av1_setup_dst_planes(x->e_mbd.plane, bsize, + get_frame_new_buffer(&cpi->common), mi_row, mi_col); + + av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + + av1_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); + if (rd_stats_y.skip && rd_stats_uv.skip) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + skip_blk = 0; + } else if (RDCOST(x->rdmult, x->rddiv, + (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, x->rddiv, rate_skip1, + (rd_stats_y.sse + rd_stats_uv.sse))) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + skip_blk = 1; + } else { + rd_stats_y.rate += rate_skip0; + skip_blk = 0; + } + backup_skip = skip_blk; + backup_mbmi = *mbmi; + rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate), + (rd_stats_y.dist + rd_stats_uv.dist)); + rd_causal += RDCOST(x->rdmult, x->rddiv, + av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); + + // Check non-causal mode + mbmi->motion_mode = OBMC_CAUSAL; + av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + av1_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); + if (rd_stats_y.skip && rd_stats_uv.skip) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + skip_blk = 0; + } else if (RDCOST(x->rdmult, x->rddiv, + (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, x->rddiv, rate_skip1, + (rd_stats_y.sse + rd_stats_uv.sse))) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + skip_blk = 1; + } else { + rd_stats_y.rate += rate_skip0; + skip_blk = 0; + } + + if (rd_causal > + RDCOST(x->rdmult, x->rddiv, + rd_stats_y.rate + rd_stats_uv.rate + + av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), + (rd_stats_y.dist + rd_stats_uv.dist))) { + x->skip = skip_blk; + } else { + *mbmi = backup_mbmi; + x->skip = backup_skip; + } +} +#endif // CONFIG_NCOBMC +#endif // CONFIG_MOTION_VAR diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h new file mode 100644 index 000000000..a7053b289 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_RDOPT_H_ +#define AV1_ENCODER_RDOPT_H_ + +#include "av1/common/blockd.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/context_tree.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct TileInfo; +struct AV1_COMP; +struct macroblock; +struct RD_STATS; + +#if CONFIG_RD_DEBUG +static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane, + TX_SIZE tx_size, int blk_row, + int blk_col, int txb_coeff_cost) { + (void)blk_row; + (void)blk_col; + (void)tx_size; + rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; + +#if CONFIG_VAR_TX + { + const int txb_h = tx_size_high_unit[tx_size]; + const int txb_w = tx_size_wide_unit[tx_size]; + int idx, idy; + for (idy = 0; idy < txb_h; ++idy) + for (idx = 0; idx < txb_w; ++idx) + rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0; + + rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost; + } + assert(blk_row < TXB_COEFF_COST_MAP_SIZE); + assert(blk_col < TXB_COEFF_COST_MAP_SIZE); +#endif +} +#endif + +typedef enum OUTPUT_STATUS { + OUTPUT_HAS_PREDICTED_PIXELS, + OUTPUT_HAS_DECODED_PIXELS +} OUTPUT_STATUS; + +void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, + TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, + OUTPUT_STATUS output_status); + +#if !CONFIG_PVQ || CONFIG_VAR_TX +int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, + int use_fast_coef_costing); +#endif +void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); + +unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs); +#if CONFIG_HIGHBITDEPTH +unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd); +#endif + +void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, + struct RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *returnrate_nocoef, +#endif // CONFIG_SUPERTX + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +void av1_rd_pick_inter_mode_sb_seg_skip( + const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); + +int av1_internal_image_edge(const struct AV1_COMP *cpi); +int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step); +int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step); +int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col); + +void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, + struct RD_STATS *rd_cost, +#if CONFIG_SUPERTX + int *returnrate_nocoef, +#endif // CONFIG_SUPERTX + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, + int mi_row, int mi_col); +#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC + +#if CONFIG_SUPERTX +#if CONFIG_VAR_TX +void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, + int blk_row, int blk_col, int plane, int block, + int plane_bsize, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, RD_STATS *rd_stats); +#endif + +void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate, + int64_t *distortion, int *skippable, + int64_t *sse, int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting); +#endif // CONFIG_SUPERTX + +#ifdef __cplusplus +} // extern "C" +#endif + +int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, TX_SIZE tx_size, + TX_TYPE tx_type); + +#endif // AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c new file mode 100644 index 000000000..b581a61d0 --- /dev/null +++ b/third_party/aom/av1/encoder/segmentation.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_mem/aom_mem.h" + +#include "av1/common/pred_common.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/subexp.h" + +void av1_enable_segmentation(struct segmentation *seg) { + seg->enabled = 1; + seg->update_map = 1; + seg->update_data = 1; +} + +void av1_disable_segmentation(struct segmentation *seg) { + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; +} + +void av1_set_segment_data(struct segmentation *seg, signed char *feature_data, + unsigned char abs_delta) { + seg->abs_delta = abs_delta; + + memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); +} +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] &= ~(1 << feature_id); +} + +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_data[segment_id][feature_id] = 0; +} + +// Based on set of segment counts calculate a probability tree +static void calc_segtree_probs(unsigned *segcounts, + aom_prob *segment_tree_probs, + const aom_prob *cur_tree_probs, + const int probwt) { + // Work out probabilities of each segment + const unsigned cc[4] = { segcounts[0] + segcounts[1], + segcounts[2] + segcounts[3], + segcounts[4] + segcounts[5], + segcounts[6] + segcounts[7] }; + const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] }; + int i; + + segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]); + segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]); + segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]); + segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]); + segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]); + segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]); + segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]); + + for (i = 0; i < 7; i++) { + const unsigned *ct = + i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2; + av1_prob_diff_update_savings_search(ct, cur_tree_probs[i], + &segment_tree_probs[i], + DIFF_UPDATE_PROB, probwt); + } +} + +// Based on set of segment counts and probabilities calculate a cost estimate +static int cost_segmap(unsigned *segcounts, aom_prob *probs) { + const int c01 = segcounts[0] + segcounts[1]; + const int c23 = segcounts[2] + segcounts[3]; + const int c45 = segcounts[4] + segcounts[5]; + const int c67 = segcounts[6] + segcounts[7]; + const int c0123 = c01 + c23; + const int c4567 = c45 + c67; + + // Cost the top node of the tree + int cost = c0123 * av1_cost_zero(probs[0]) + c4567 * av1_cost_one(probs[0]); + + // Cost subsequent levels + if (c0123 > 0) { + cost += c01 * av1_cost_zero(probs[1]) + c23 * av1_cost_one(probs[1]); + + if (c01 > 0) + cost += segcounts[0] * av1_cost_zero(probs[3]) + + segcounts[1] * av1_cost_one(probs[3]); + if (c23 > 0) + cost += segcounts[2] * av1_cost_zero(probs[4]) + + segcounts[3] * av1_cost_one(probs[4]); + } + + if (c4567 > 0) { + cost += c45 * av1_cost_zero(probs[2]) + c67 * av1_cost_one(probs[2]); + + if (c45 > 0) + cost += segcounts[4] * av1_cost_zero(probs[5]) + + segcounts[5] * av1_cost_one(probs[5]); + if (c67 > 0) + cost += segcounts[6] * av1_cost_zero(probs[6]) + + segcounts[7] * av1_cost_one(probs[6]); + } + + return cost; +} + +static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, + unsigned *no_pred_segcounts, + unsigned (*temporal_predictor_count)[2], + unsigned *t_unpred_seg_counts, int bw, int bh, + int mi_row, int mi_col) { + int segment_id; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + xd->mi = mi; + segment_id = xd->mi[0]->mbmi.segment_id; + + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, +#if CONFIG_DEPENDENT_HORZTILES + cm->dependent_horz_tiles, +#endif // CONFIG_DEPENDENT_HORZTILES + cm->mi_rows, cm->mi_cols); + + // Count the number of hits on each segment with no prediction + no_pred_segcounts[segment_id]++; + + // Temporal prediction not allowed on key frames + if (cm->frame_type != KEY_FRAME) { + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + // Test to see if the segment id matches the predicted value. + const int pred_segment_id = + get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); + const int pred_flag = pred_segment_id == segment_id; + const int pred_context = av1_get_pred_context_seg_id(xd); + + // Store the prediction status for this mb and update counts + // as appropriate + xd->mi[0]->mbmi.seg_id_predicted = pred_flag; + temporal_predictor_count[pred_context][pred_flag]++; + + // Update the "unpredicted" segment count + if (!pred_flag) t_unpred_seg_counts[segment_id]++; + } +} + +static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, + unsigned *no_pred_segcounts, + unsigned (*temporal_predictor_count)[2], + unsigned *t_unpred_seg_counts, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int mis = cm->mi_stride; + const int bs = mi_size_wide[bsize], hbs = bs / 2; +#if CONFIG_EXT_PARTITION_TYPES + PARTITION_TYPE partition; +#else + int bw, bh; +#endif // CONFIG_EXT_PARTITION_TYPES + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + +#if CONFIG_EXT_PARTITION_TYPES + if (bsize == BLOCK_8X8) + partition = PARTITION_NONE; + else + partition = get_partition(cm, mi_row, mi_col, bsize); + switch (partition) { + case PARTITION_NONE: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, bs, mi_row, mi_col); + break; + case PARTITION_HORZ: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, + mi_col + hbs); + break; + case PARTITION_HORZ_A: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row, mi_col + hbs); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row + hbs, mi_col); + count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row + hbs, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, + mi_col + hbs); + break; + case PARTITION_VERT_B: + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row, mi_col + hbs); + count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, hbs, + mi_row + hbs, mi_col + hbs); + break; + case PARTITION_SPLIT: { + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; + int n; + + assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs && + num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs); + + for (n = 0; n < 4; n++) { + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); + + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row + mi_dr, mi_col + mi_dc, subsize); + } + } break; + default: assert(0); + } +#else + bw = mi_size_wide[mi[0]->mbmi.sb_type]; + bh = mi_size_high[mi[0]->mbmi.sb_type]; + + if (bw == bs && bh == bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, bs, mi_row, mi_col); + } else if (bw == bs && bh < bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + } else if (bw < bs && bh == bs) { + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, + mi_col + hbs); + } else { + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; + int n; + + assert(bw < bs && bh < bs); + + for (n = 0; n < 4; n++) { + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); + + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row + mi_dr, mi_col + mi_dc, subsize); + } + } +#endif // CONFIG_EXT_PARTITION_TYPES +} + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { + struct segmentation *seg = &cm->seg; + struct segmentation_probs *segp = &cm->fc->seg; + + int no_pred_cost; + int t_pred_cost = INT_MAX; + + int i, tile_col, tile_row, mi_row, mi_col; +#if CONFIG_TILE_GROUPS + const int probwt = cm->num_tg; +#else + const int probwt = 1; +#endif + + unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred; + unsigned *no_pred_segcounts = cm->counts.seg.tree_total; + unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred; + + aom_prob no_pred_tree[SEG_TREE_PROBS]; + aom_prob t_pred_tree[SEG_TREE_PROBS]; + aom_prob t_nopred_prob[PREDICTION_PROBS]; + + (void)xd; + + // We are about to recompute all the segment counts, so zero the accumulators. + av1_zero(cm->counts.seg); + + // First of all generate stats regarding how well the last segment map + // predicts this one + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + MODE_INFO **mi_ptr; + av1_tile_set_col(&tile_info, cm, tile_col); +#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES + av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); +#endif + mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride + + tile_info.mi_col_start; + for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) { + MODE_INFO **mi = mi_ptr; + for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->mib_size, mi += cm->mib_size) { + count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, mi_row, + mi_col, cm->sb_size); + } + } + } + } + + // Work out probability tree for coding segments without prediction + // and the cost. + calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt); + no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); + + // Key frames cannot use temporal prediction + if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { + // Work out probability tree for coding those segments not + // predicted using the temporal method and the cost. + calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs, + probwt); + t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); + + // Add in the cost of the signaling for each prediction context. + for (i = 0; i < PREDICTION_PROBS; i++) { + const int count0 = temporal_predictor_count[i][0]; + const int count1 = temporal_predictor_count[i][1]; + + t_nopred_prob[i] = get_binary_prob(count0, count1); + av1_prob_diff_update_savings_search( + temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i], + DIFF_UPDATE_PROB, probwt); + + // Add in the predictor signaling cost + t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) + + count1 * av1_cost_one(t_nopred_prob[i]); + } + } + + // Now choose which coding method to use. + if (t_pred_cost < no_pred_cost) { + assert(!cm->error_resilient_mode); + seg->temporal_update = 1; + } else { + seg->temporal_update = 0; + } +} + +void av1_reset_segment_features(AV1_COMMON *cm) { + struct segmentation *seg = &cm->seg; + + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + av1_clearall_segfeatures(seg); +} diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h new file mode 100644 index 000000000..c1491ca2a --- /dev/null +++ b/third_party/aom/av1/encoder/segmentation.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_SEGMENTATION_H_ +#define AV1_ENCODER_SEGMENTATION_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_enable_segmentation(struct segmentation *seg); +void av1_disable_segmentation(struct segmentation *seg); + +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +// The values given for each segment can be either deltas (from the default +// value chosen for the frame) or absolute values. +// +// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for +// SEGMENT_ALT_LF) +// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for +// SEGMENT_ALT_LF) +// +// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use +// the absolute values given). +void av1_set_segment_data(struct segmentation *seg, signed char *feature_data, + unsigned char abs_delta); + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_reset_segment_features(AV1_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_SEGMENTATION_H_ diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c new file mode 100644 index 000000000..20c96761b --- /dev/null +++ b/third_party/aom/av1/encoder/speed_features.c @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/rdopt.h" + +#include "aom_dsp/aom_dsp_common.h" + +#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +static MESH_PATTERN + good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + }; +static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { + 50, 25, 15, 5, 1, 1 +}; + +#if CONFIG_INTRABC +// TODO(aconverse@google.com): These settings are pretty relaxed, tune them for +// each speed setting +static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, +}; +static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100, + 25, 25, 10 }; +#endif + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const AV1_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +// Sets a partition size down to which the auto partition code will always +// search (can go lower), based on the image dimensions. The logic here +// is that the extent to which ringing artefacts are offensive, depends +// partly on the screen area that over which they propogate. Propogation is +// limited by transform block size but the screen area take up by a given block +// size will be larger for a small image format stretched to full screen. +static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) { + unsigned int screen_area = (cm->width * cm->height); + + // Select block size based on image format size. + if (screen_area < 1280 * 720) { + // Formats smaller in area than 720P + return BLOCK_4X4; + } else if (screen_area < 1920 * 1080) { + // Format >= 720P and < 1080P + return BLOCK_8X8; + } else { + // Formats 1080P and up + return BLOCK_16X16; + } +} + +static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + + if (speed >= 1) { + if (AOMMIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->partition_search_breakout_dist_thr = (1 << 23); + } else { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->partition_search_breakout_dist_thr = (1 << 21); + } + } + + if (speed >= 2) { + if (AOMMIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; + sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_rate_thr = 120; + } else { + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + sf->partition_search_breakout_dist_thr = (1 << 22); + sf->partition_search_breakout_rate_thr = 100; + } + sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); + } + + if (speed >= 3) { + if (AOMMIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; + sf->partition_search_breakout_dist_thr = (1 << 25); + sf->partition_search_breakout_rate_thr = 200; + } else { + sf->max_intra_bsize = BLOCK_32X32; + sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; + sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_rate_thr = 120; + } + } + + // If this is a two pass clip that fits the criteria for animated or + // graphics content then reset disable_split_mask for speeds 1-4. + // Also if the image edge is internal to the coded area. + if ((speed >= 1) && (cpi->oxcf.pass == 2) && + ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || + (av1_internal_image_edge(cpi)))) { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + } + + if (speed >= 4) { + if (AOMMIN(cm->width, cm->height) >= 720) { + sf->partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->partition_search_breakout_dist_thr = (1 << 24); + } + sf->disable_split_mask = DISABLE_ALL_SPLIT; + } +} + +static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm, + SPEED_FEATURES *sf, int speed) { + const int boosted = frame_is_boosted(cpi); + + if (speed >= 1) { + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_type_search.fast_inter_tx_type_search = 1; + } + + if (speed >= 2) { + if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || + av1_internal_image_edge(cpi)) { + sf->use_square_partition_only = !frame_is_boosted(cpi); + } else { + sf->use_square_partition_only = !frame_is_intra_only(cm); + } + + sf->less_rectangular_check = 1; + + sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 1; + sf->mv.subpel_iters_per_step = 1; + sf->mode_skip_start = 10; + sf->adaptive_pred_interp_filter = 1; + + sf->recode_loop = ALLOW_RECODE_KFARFGF; +#if CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V; +#endif // CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + + sf->tx_size_search_breakout = 1; + sf->partition_search_breakout_rate_thr = 80; + sf->tx_type_search.prune_mode = PRUNE_ONE; + // Use transform domain distortion. + // Note var-tx expt always uses pixel domain distortion. + sf->use_transform_domain_distortion = 1; +#if CONFIG_EXT_INTER + sf->disable_wedge_search_var_thresh = 100; + sf->fast_wedge_sign_estimate = 1; +#endif // CONFIG_EXT_INTER + } + + if (speed >= 3) { + sf->tx_size_search_method = + frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; + sf->mode_search_skip_flags = + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->allow_partition_search_skip = 1; + sf->use_upsampled_references = 0; + sf->adaptive_rd_thresh = 2; +#if CONFIG_EXT_TX + sf->tx_type_search.prune_mode = PRUNE_TWO; +#endif + } + + if (speed >= 4) { + sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->tx_size_search_method = + frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 1; + sf->cb_partition_search = !boosted; + sf->cb_pred_filter_search = 1; + sf->alt_ref_search_fp = 1; + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->adaptive_rd_thresh = 3; + sf->mode_skip_start = 6; +#if CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; + sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC; +#endif // CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; + sf->adaptive_interp_filter_search = 1; + } + + if (speed >= 5) { + sf->use_square_partition_only = 1; + sf->tx_size_search_method = USE_LARGESTALL; + sf->mv.search_method = BIGDIA; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->adaptive_rd_thresh = 4; + if (cm->frame_type != KEY_FRAME) + sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE; + sf->disable_filter_search_var_thresh = 200; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; + sf->partition_search_breakout_rate_thr = 300; + } + + if (speed >= 6) { + int i; + sf->optimize_coefficients = 0; + sf->mv.search_method = HEX; + sf->disable_filter_search_var_thresh = 500; + for (i = 0; i < TX_SIZES; ++i) { + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; + } + sf->partition_search_breakout_rate_thr = 500; + sf->mv.reduce_first_step_size = 1; + sf->simple_model_rd_from_var = 1; + } + if (speed >= 7) { + const int is_keyframe = cm->frame_type == KEY_FRAME; + const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; + sf->default_max_partition_size = BLOCK_32X32; + sf->default_min_partition_size = BLOCK_8X8; +#if CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; +#endif // CONFIG_TX64X64 + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->frame_parameter_update = 0; + sf->mv.search_method = FAST_HEX; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; +#if CONFIG_EXT_PARTITION + sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST; +#endif // CONFIG_EXT_PARTITION + sf->partition_search_type = REFERENCE_PARTITION; + sf->default_min_partition_size = BLOCK_8X8; + sf->reuse_inter_pred_sby = 1; + sf->force_frame_boost = + is_keyframe || + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); + sf->max_delta_qindex = is_keyframe ? 20 : 15; + sf->coeff_prob_appx_step = 4; + sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + } +} + +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + RD_OPT *const rd = &cpi->rd; + int i; + +// Limit memory usage for high resolutions +#if CONFIG_EXT_REFS + // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for + // ext-refs. Need to work with @yunqingwang to have a more + // effective solution. + if (AOMMIN(cm->width, cm->height) > 720) { + // Turn off the use of upsampled references for HD resolution + sf->use_upsampled_references = 0; + } else if ((AOMMIN(cm->width, cm->height) > 540) && + (oxcf->profile != PROFILE_0)) { + sf->use_upsampled_references = 0; + } +#else + if (AOMMIN(cm->width, cm->height) > 1080) { + sf->use_upsampled_references = 0; + } else if ((AOMMIN(cm->width, cm->height) > 720) && + (oxcf->profile != PROFILE_0)) { + sf->use_upsampled_references = 0; + } +#endif // CONFIG_EXT_REFS + + if (oxcf->mode == GOOD) { + set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); + } + + if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { + sf->adaptive_pred_interp_filter = 0; + } + + // Check for masked out split cases. + for (i = 0; i < MAX_REFS; ++i) { + if (sf->disable_split_mask & (1 << i)) { + rd->thresh_mult_sub8x8[i] = INT_MAX; + } + } + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; +} + +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + // best quality defaults + sf->frame_parameter_update = 1; + sf->mv.search_method = NSTEP; + sf->recode_loop = ALLOW_RECODE; + sf->mv.subpel_search_method = SUBPEL_TREE; + sf->mv.subpel_iters_per_step = 2; + sf->mv.subpel_force_stop = 0; + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); + sf->mv.reduce_first_step_size = 0; + sf->coeff_prob_appx_step = 1; + sf->mv.auto_mv_step_size = 0; + sf->mv.fullpel_search_step_param = 6; + sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->adaptive_rd_thresh = 0; + sf->tx_size_search_method = USE_FULL_RD; + sf->adaptive_motion_search = 0; + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 0; + sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; + sf->alt_ref_search_fp = 0; + sf->partition_search_type = SEARCH_PARTITION; + sf->tx_type_search.prune_mode = NO_PRUNE; + sf->tx_type_search.fast_intra_tx_type_search = 0; + sf->tx_type_search.fast_inter_tx_type_search = 0; + sf->less_rectangular_check = 0; + sf->use_square_partition_only = 0; + sf->auto_min_max_partition_size = NOT_IN_USE; + sf->rd_auto_partition_min_limit = BLOCK_4X4; + sf->default_max_partition_size = BLOCK_LARGEST; + sf->default_min_partition_size = BLOCK_4X4; + sf->adjust_partitioning_from_last_frame = 0; + sf->last_partitioning_redo_frequency = 4; + sf->disable_split_mask = 0; + sf->mode_search_skip_flags = 0; + sf->force_frame_boost = 0; + sf->max_delta_qindex = 0; + sf->disable_filter_search_var_thresh = 0; + sf->adaptive_interp_filter_search = 0; + sf->allow_partition_search_skip = 0; + sf->use_upsampled_references = 1; +#if CONFIG_EXT_INTER + sf->disable_wedge_search_var_thresh = 0; + sf->fast_wedge_sign_estimate = 0; +#endif // CONFIG_EXT_INTER + + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; + } + sf->use_rd_breakout = 0; + sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + sf->use_fast_coef_updates = TWO_LOOP; + sf->use_fast_coef_costing = 0; + sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set + sf->schedule_mode_search = 0; + for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL; + sf->max_intra_bsize = BLOCK_LARGEST; + sf->reuse_inter_pred_sby = 0; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + sf->always_this_block_size = BLOCK_16X16; + sf->search_type_check_frequency = 50; + // Recode loop tolerance %. + sf->recode_tolerance = 25; + sf->default_interp_filter = SWITCHABLE; + sf->tx_size_search_breakout = 0; + sf->partition_search_breakout_dist_thr = 0; + sf->partition_search_breakout_rate_thr = 0; + sf->simple_model_rd_from_var = 0; + + // Set this at the appropriate speed levels + sf->use_transform_domain_distortion = 0; + + if (oxcf->mode == GOOD +#if CONFIG_XIPHRC + || oxcf->pass == 1 +#endif + ) + set_good_speed_feature(cpi, cm, sf, oxcf->speed); + + // sf->partition_search_breakout_dist_thr is set assuming max 64x64 + // blocks. Normalise this if the blocks are bigger. + if (MAX_SB_SIZE_LOG2 > 6) { + sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6); + } + + cpi->full_search_sad = av1_full_search_sad; + cpi->diamond_search_sad = av1_diamond_search_sad; + + sf->allow_exhaustive_searches = 1; + int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) + sf->exhaustive_searches_thresh = (1 << 24); + else + sf->exhaustive_searches_thresh = (1 << 25); + sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; + if (speed > 0) + sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; + + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[speed][i].interval; + } +#if CONFIG_INTRABC + if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) && + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || + cpi->oxcf.content == AOM_CONTENT_SCREEN)) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range; + sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval; + } + sf->max_exaustive_pct = intrabc_max_mesh_pct[speed]; + } +#endif // CONFIG_INTRABC + +#if !CONFIG_XIPHRC + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (oxcf->pass == 1) sf->optimize_coefficients = 0; +#endif + + // No recode for 1 pass. + if (oxcf->pass == 0) { + sf->recode_loop = DISALLOW_RECODE; + sf->optimize_coefficients = 0; + } + + if (sf->mv.subpel_search_method == SUBPEL_TREE) { + cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) { + cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) { + cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) { + cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore; + } + +#if !CONFIG_AOM_QM + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; +#else + // FIXME: trellis not very efficient for quantisation matrices + x->optimize = 0; +#endif + + x->min_partition_size = sf->default_min_partition_size; + x->max_partition_size = sf->default_max_partition_size; + + if (!cpi->oxcf.frame_periodic_boost) { + sf->max_delta_qindex = 0; + } + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; +} diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h new file mode 100644 index 000000000..af54a1a9a --- /dev/null +++ b/third_party/aom/av1/encoder/speed_features.h @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_SPEED_FEATURES_H_ +#define AV1_ENCODER_SPEED_FEATURES_H_ + +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) | + (1 << D207_PRED) | (1 << D63_PRED) | +#if CONFIG_ALT_INTRA + (1 << SMOOTH_PRED) | +#endif // CONFIG_ALT_INTRA + (1 << TM_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = + (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +#if CONFIG_EXT_INTER +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) | + (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | + (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV), + INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | + (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | + (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), + INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | + (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) | + (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), + INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | + (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), + INTER_NEAREST_NEW_ZERO = + (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) | + (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), + INTER_NEAREST_NEAR_NEW = + (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEARMV) | + (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), + INTER_NEAREST_NEAR_ZERO = + (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | + (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEAREST_NEARMV) | + (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), +}; +#else +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV), + INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV), + INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV), +}; +#endif // CONFIG_EXT_INTER + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +typedef enum { + DIAMOND = 0, + NSTEP = 1, + HEX = 2, + BIGDIA = 3, + SQUARE = 4, + FAST_HEX = 5, + FAST_DIAMOND = 6 +} SEARCH_METHODS; + +typedef enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. + ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 3, +} RECODE_LOOP_TYPE; + +typedef enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches + SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively + SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches + // Other methods to come +} SUBPEL_SEARCH_METHODS; + +typedef enum { + NO_MOTION_THRESHOLD = 0, + LOW_MOTION_THRESHOLD = 7 +} MOTION_THRESHOLD; + +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTALL, + USE_TX_8X8 +} TX_SIZE_SEARCH_METHOD; + +typedef enum { + NOT_IN_USE = 0, + RELAXED_NEIGHBORING_MIN_MAX = 1 +} AUTO_MIN_MAX_MODE; + +typedef enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} LPF_PICK_METHOD; + +typedef enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} MODE_SEARCH_SKIP_LOGIC; + +typedef enum { + FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR, + FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, + FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP, +} INTERP_FILTER_MASK; + +typedef enum { + NO_PRUNE = 0, + // eliminates one tx type in vertical and horizontal direction + PRUNE_ONE = 1, +#if CONFIG_EXT_TX + // eliminates two tx types in each direction + PRUNE_TWO = 2, +#endif +} TX_TYPE_PRUNE_MODE; + +typedef struct { + TX_TYPE_PRUNE_MODE prune_mode; + int fast_intra_tx_type_search; + int fast_inter_tx_type_search; +} TX_TYPE_SEARCH; + +typedef enum { + // Search partitions using RD criterion + SEARCH_PARTITION, + + // Always use a fixed size partition + FIXED_PARTITION, + + REFERENCE_PARTITION, + + // Use an arbitrary partitioning scheme based on source variance within + // a 64X64 SB + VAR_BASED_PARTITION, + + // Use non-fixed partitions based on source variance + SOURCE_VAR_BASED_PARTITION +} PARTITION_SEARCH_TYPE; + +typedef enum { + // Does a dry run to see if any of the contexts need to be updated or not, + // before the final run. + TWO_LOOP = 0, + + // No dry run, also only half the coef contexts and bands are updated. + // The rest are not updated at all. + ONE_LOOP_REDUCED = 1 +} FAST_COEFF_UPDATE; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // This parameter controls which step in the n-step process we start at. + // It's changed adaptively based on circumstances. + int reduce_first_step_size; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHODS subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // Control when to stop subpel search + int subpel_force_stop; + + // This variable sets the step_param used in full pel motion search. + int fullpel_search_step_param; +} MV_SPEED_FEATURES; + +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +typedef struct SPEED_FEATURES { + MV_SPEED_FEATURES mv; + + // Frame level coding parameter update + int frame_parameter_update; + + RECODE_LOOP_TYPE recode_loop; + + // Trellis (dynamic programming) optimization of quantized values (+1, 0). + int optimize_coefficients; + + // Always set to 0. If on it enables 0 cost background transmission + // (except for the initial transmission of the segmentation). The feature is + // disabled because the addition of very large block sizes make the + // backgrounds very to cheap to encode, and the segmentation we have + // adds overhead. + int static_segmentation; + + // If 1 we iterate finding a best reference for 2 ref frames together - via + // a log search that iterates 4 times (check around mv for last for best + // error of combined predictor then check around mv for alt). If 0 we + // we just use the best motion vector found for each frame by itself. + BLOCK_SIZE comp_inter_joint_search_thresh; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. + int adaptive_rd_thresh; + + // Coefficient probability model approximation step size + int coeff_prob_appx_step; + + // The threshold is to determine how slow the motino is, it is used when + // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION + MOTION_THRESHOLD lf_motion_threshold; + + // Determine which method we use to determine transform size. We can choose + // between options like full rd, largest for prediction size, largest + // for intra and model coefs for the rest. + TX_SIZE_SEARCH_METHOD tx_size_search_method; + + // After looking at the first set of modes (set by index here), skip + // checking modes for reference frames that don't match the reference frame + // of the best so far. + int mode_skip_start; + + PARTITION_SEARCH_TYPE partition_search_type; + + TX_TYPE_SEARCH tx_type_search; + + // Used if partition_search_type = FIXED_SIZE_PARTITION + BLOCK_SIZE always_this_block_size; + + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. + int less_rectangular_check; + + // Disable testing non square partitions. (eg 16x32) + int use_square_partition_only; + + // Sets min and max partition sizes for this superblock based on the + // same superblock in last encoded frame, and the left and above neighbor. + AUTO_MIN_MAX_MODE auto_min_max_partition_size; + // Ensures the rd based auto partition search will always + // go down at least to the specified level. + BLOCK_SIZE rd_auto_partition_min_limit; + + // Min and max partition size we enable (block_size) as per auto + // min max, but also used by adjust partitioning, and pick_partitioning. + BLOCK_SIZE default_min_partition_size; + BLOCK_SIZE default_max_partition_size; + + // Whether or not we allow partitions one smaller or one greater than the last + // frame's partitioning. Only used if use_lastframe_partitioning is set. + int adjust_partitioning_from_last_frame; + + // How frequently we re do the partitioning from scratch. Only used if + // use_lastframe_partitioning is set. + int last_partitioning_redo_frequency; + + // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable + // it always, to allow it for only Last frame and Intra, disable it for all + // inter modes or to enable it always. + int disable_split_mask; + + // TODO(jingning): combine the related motion search speed features + // This allows us to use motion search at other sizes as a starting + // point for this motion search and limits the search range around it. + int adaptive_motion_search; + + // Flag for allowing some use of exhaustive searches; + int allow_exhaustive_searches; + + // Threshold for allowing exhaistive motion search. + int exhaustive_searches_thresh; + + // Maximum number of exhaustive searches for a frame. + int max_exaustive_pct; + + // Pattern to be used for any exhaustive mesh searches. + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + int schedule_mode_search; + + // Allows sub 8x8 modes to use the prediction filter that was determined + // best for 8x8 mode. If set to 0 we always re check all the filters for + // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter + // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. + int adaptive_pred_interp_filter; + + // Adaptive prediction mode search + int adaptive_mode_search; + + // Chessboard pattern prediction filter type search + int cb_pred_filter_search; + + int cb_partition_search; + + int alt_ref_search_fp; + + // Use finer quantizer in every other few frames that run variable block + // partition type search. + int force_frame_boost; + + // Maximally allowed base quantization index fluctuation. + int max_delta_qindex; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + +#if CONFIG_EXT_INTER + // A source variance threshold below which wedge search is disabled + unsigned int disable_wedge_search_var_thresh; + + // Whether fast wedge sign estimate is used + int fast_wedge_sign_estimate; +#endif // CONFIG_EXT_INTER + + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // These bit masks allow you to enable or disable intra modes for each + // prediction block size separately. + int intra_y_mode_bsize_mask[BLOCK_SIZES]; + + // This variable enables an early break out of mode testing if the model for + // rd built from the prediction signal indicates a value that's much + // higher than the best rd we've seen so far. + int use_rd_breakout; + + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // This feature limits the number of coefficients updates we actually do + // by only looking at counts from 1/2 the bands. + FAST_COEFF_UPDATE use_fast_coef_updates; + + // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; + + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; + + // This feature controls the tolerence vs target used in deciding whether to + // recode a frame. It has no meaning if recode is disabled. + int recode_tolerance; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // The frequency that we check if SOURCE_VAR_BASED_PARTITION or + // FIXED_PARTITION search type should be used. + int search_type_check_frequency; + + // When partition is pre-set, the inter prediction result from pick_inter_mode + // can be reused in final block encoding process. It is enabled only for real- + // time mode speed 6. + int reuse_inter_pred_sby; + + // default interp filter choice + InterpFilter default_interp_filter; + + // Early termination in transform size search, which only applies while + // tx_size_search_method is USE_FULL_RD. + int tx_size_search_breakout; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; + + // mask for skip evaluation of certain interp_filter type. + INTERP_FILTER_MASK interp_filter_search_mask; + + // Partition search early breakout thresholds. + int64_t partition_search_breakout_dist_thr; + int partition_search_breakout_rate_thr; + + // Allow skipping partition search for still image frame + int allow_partition_search_skip; + + // Fast approximation of av1_model_rd_from_var_lapndz + int simple_model_rd_from_var; + + // Do sub-pixel search in up-sampled reference frames + int use_upsampled_references; + + // Whether to compute distortion in the image domain (slower but + // more accurate), or in the transform domain (faster but less acurate). + int use_transform_domain_distortion; +} SPEED_FEATURES; + +struct AV1_COMP; + +void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi); +void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c new file mode 100644 index 000000000..8960d3341 --- /dev/null +++ b/third_party/aom/av1/encoder/subexp.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom_dsp/bitwriter.h" + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/subexp.h" + +static const uint8_t update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 0, +}; +#define MIN_DELP_BITS 5 + +static int recenter_nonneg(int v, int m) { + if (v > (m << 1)) + return v; + else if (v >= m) + return ((v - m) << 1); + else + return ((m - v) << 1) - 1; +} + +static int remap_prob(int v, int m) { + int i; + static const uint8_t map_table[MAX_PROB - 1] = { + // generated by: + // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, + 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, + 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, + 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, + }; + v--; + m--; + if ((m << 1) <= MAX_PROB) + i = recenter_nonneg(v, m) - 1; + else + i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + + i = map_table[i]; + return i; +} + +static int prob_diff_update_cost(aom_prob newp, aom_prob oldp) { + int delp = remap_prob(newp, oldp); + return update_bits[delp] << AV1_PROB_COST_SHIFT; +} + +static void encode_uniform(aom_writer *w, int v) { + const int l = 8; + const int m = (1 << l) - 190; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_literal(w, (v - m) & 1, 1); + } +} + +static INLINE int write_bit_gte(aom_writer *w, int word, int test) { + aom_write_literal(w, word >= test, 1); + return word >= test; +} + +static void encode_term_subexp(aom_writer *w, int word) { + if (!write_bit_gte(w, word, 16)) { + aom_write_literal(w, word, 4); + } else if (!write_bit_gte(w, word, 32)) { + aom_write_literal(w, word - 16, 4); + } else if (!write_bit_gte(w, word, 64)) { + aom_write_literal(w, word - 32, 5); + } else { + encode_uniform(w, word - 64); + } +} + +void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp) { + const int delp = remap_prob(newp, oldp); + encode_term_subexp(w, delp); +} + +int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, + aom_prob *bestp, aom_prob upd, + int probwt) { + const uint32_t old_b = cost_branch256(ct, oldp); + int bestsavings = 0; + aom_prob newp, bestnewp = oldp; + const int step = *bestp > oldp ? -1 : 1; + const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); + + if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) { + for (newp = *bestp; newp != oldp; newp += step) { + const int new_b = cost_branch256(ct, newp); + const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int savings = (int)((int64_t)old_b - new_b - update_b * probwt); + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + } + *bestp = bestnewp; + return bestsavings; +} + +int av1_prob_diff_update_savings_search_model(const unsigned int *ct, + const aom_prob oldp, + aom_prob *bestp, aom_prob upd, + int stepsize, int probwt) { + int i, old_b, new_b, update_b, savings, bestsavings; + int newp; + const int step_sign = *bestp > oldp ? -1 : 1; + const int step = stepsize * step_sign; + const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); + const aom_prob *newplist, *oldplist; + aom_prob bestnewp; + oldplist = av1_pareto8_full[oldp - 1]; + old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp); + for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) + old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]); + + bestsavings = 0; + bestnewp = oldp; + + assert(stepsize > 0); + + if (old_b > upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) { + for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) { + if (newp < 1 || newp > 255) continue; + newplist = av1_pareto8_full[newp - 1]; + new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp); + for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) + new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]); + update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + savings = old_b - new_b - update_b * probwt; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + } + + *bestp = bestnewp; + return bestsavings; +} + +#if CONFIG_SUBFRAME_PROB_UPDATE +static int get_cost(unsigned int ct[][2], aom_prob p, int n) { + int i, p0 = p; + unsigned int total_ct[2] = { 0, 0 }; + int cost = 0; + + for (i = 0; i <= n; ++i) { + cost += cost_branch256(ct[i], p); + total_ct[0] += ct[i][0]; + total_ct[1] += ct[i][1]; + if (i < n) + p = av1_merge_probs(p0, total_ct, COEF_COUNT_SAT, COEF_MAX_UPDATE_FACTOR); + } + return cost; +} + +int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp, + aom_prob *bestp, aom_prob upd, int n) { + const int old_b = get_cost(ct, oldp, n); + int bestsavings = 0; + const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); + aom_prob newp, bestnewp = oldp; + const int step = *bestp > oldp ? -1 : 1; + + for (newp = *bestp; newp != oldp; newp += step) { + const int new_b = get_cost(ct, newp, n); + const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} + +int av1_prob_update_search_model_subframe( + unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp, + aom_prob *bestp, aom_prob upd, int stepsize, int n) { + int i, old_b, new_b, update_b, savings, bestsavings; + int newp; + const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1; + const int step = stepsize * step_sign; + const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); + aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; + av1_model_to_full_probs(oldp, oldplist); + memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES); + for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) + old_b += get_cost(ct[i], oldplist[i], n); + old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n); + + bestsavings = 0; + bestnewp = oldp[PIVOT_NODE]; + + assert(stepsize > 0); + + for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) { + if (newp < 1 || newp > 255) continue; + newplist[PIVOT_NODE] = newp; + av1_model_to_full_probs(newplist, newplist); + for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) + new_b += get_cost(ct[i], newplist[i], n); + new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n); + update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + upd_cost; + savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + + *bestp = bestnewp; + return bestsavings; +} +#endif // CONFIG_SUBFRAME_PROB_UPDATE + +void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, + const unsigned int ct[2], int probwt) { + const aom_prob upd = DIFF_UPDATE_PROB; + aom_prob newp = get_binary_prob(ct[0], ct[1]); + const int savings = + av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); + assert(newp >= 1); + if (savings > 0) { + aom_write(w, 1, upd); + av1_write_prob_diff_update(w, newp, *oldp); + *oldp = newp; + } else { + aom_write(w, 0, upd); + } +} + +int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], + int probwt) { + const aom_prob upd = DIFF_UPDATE_PROB; + aom_prob newp = get_binary_prob(ct[0], ct[1]); + const int savings = + av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt); + return savings; +} diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h new file mode 100644 index 000000000..049265cb8 --- /dev/null +++ b/third_party/aom/av1/encoder/subexp.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_SUBEXP_H_ +#define AV1_ENCODER_SUBEXP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/prob.h" + +void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm); + +void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, + const unsigned int ct[2], int probwt); + +int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp, + aom_prob *bestp, aom_prob upd, + int probwt); + +int av1_prob_diff_update_savings_search_model(const unsigned int *ct, + const aom_prob oldp, + aom_prob *bestp, aom_prob upd, + int stepsize, int probwt); + +int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], + int probwt); +#if CONFIG_SUBFRAME_PROB_UPDATE +int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp, + aom_prob *bestp, aom_prob upd, int n); +int av1_prob_update_search_model_subframe( + unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp, + aom_prob *bestp, aom_prob upd, int stepsize, int n); +#endif // CONFIG_SUBFRAME_PROB_UPDATE +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_SUBEXP_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c new file mode 100644 index 000000000..de962fe84 --- /dev/null +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "av1/common/alloccommon.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/odintrin.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/temporal_filter.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_scale/aom_scale.h" + +static void temporal_filter_predictors_mb_c( + MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, + int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, + uint8_t *pred, struct scale_factors *scale, int x, int y) { + const int which_mv = 0; + const MV mv = { mv_row, mv_col }; + enum mv_precision mv_precision_uv; + int uv_stride; + // TODO(angiebird): change plane setting accordingly + ConvolveParams conv_params = get_conv_params(which_mv, 0); + +#if USE_TEMPORALFILTER_12TAP +#if CONFIG_DUAL_FILTER + const InterpFilter interp_filter[4] = { TEMPORALFILTER_12TAP, + TEMPORALFILTER_12TAP, + TEMPORALFILTER_12TAP, + TEMPORALFILTER_12TAP }; +#else + const InterpFilter interp_filter = TEMPORALFILTER_12TAP; +#endif + (void)xd; +#else + const InterpFilter interp_filter = xd->mi[0]->mbmi.interp_filter; +#endif // USE_TEMPORALFILTER_12TAP +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + WarpTypesAllowed warp_types; + memset(&warp_types, 0, sizeof(WarpTypesAllowed)); +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + + if (uv_block_width == 8) { + uv_stride = (stride + 1) >> 1; + mv_precision_uv = MV_PRECISION_Q4; + } else { + uv_stride = stride; + mv_precision_uv = MV_PRECISION_Q3; + } + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, + 16, 16, which_mv, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + 0, MV_PRECISION_Q3, x, y, xd); + + av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + 1, mv_precision_uv, x, y, xd); + + av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + 2, mv_precision_uv, x, y, xd); + return; + } +#endif // CONFIG_HIGHBITDEPTH + av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, + &conv_params, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, 0, 0, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + MV_PRECISION_Q3, x, y, xd); + + av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, + &mv, scale, uv_block_width, uv_block_height, + &conv_params, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, 1, 0, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + mv_precision_uv, x, y, xd); + + av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, + &mv, scale, uv_block_width, uv_block_height, + &conv_params, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, x, y, 2, 0, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + mv_precision_uv, x, y, xd); +} + +void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, + uint8_t *frame2, unsigned int block_width, + unsigned int block_height, int strength, + int filter_weight, unsigned int *accumulator, + uint16_t *count) { + unsigned int i, j, k; + int modifier; + int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + for (i = 0, k = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++, k++) { + int pixel_value = *frame2; + + // non-local mean approach + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = (int)i + idy; + int col = (int)j + idx; + + if (row >= 0 && row < (int)block_height && col >= 0 && + col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + + modifier *= 3; + modifier /= index; + + ++frame2; + + modifier += rounding; + modifier >>= strength; + + if (modifier > 16) modifier = 16; + + modifier = 16 - modifier; + modifier *= filter_weight; + + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + + byte++; + } + + byte += stride - block_width; + } +} + +#if CONFIG_HIGHBITDEPTH +void av1_highbd_temporal_filter_apply_c( + uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, + unsigned int block_width, unsigned int block_height, int strength, + int filter_weight, unsigned int *accumulator, uint16_t *count) { + uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); + uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); + unsigned int i, j, k; + int modifier; + int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + for (i = 0, k = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++, k++) { + int pixel_value = *frame2; + + // non-local mean approach + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = (int)i + idy; + int col = (int)j + idx; + + if (row >= 0 && row < (int)block_height && col >= 0 && + col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + + modifier *= 3; + modifier /= index; + + ++frame2; + + modifier += rounding; + modifier >>= strength; + + if (modifier > 16) modifier = 16; + + modifier = 16 - modifier; + modifier *= filter_weight; + + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + + byte++; + } + + byte += stride - block_width; + } +} +#endif // CONFIG_HIGHBITDEPTH + +static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, + uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, + int stride) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + int sadpb = x->sadperbit16; + int bestsme = INT_MAX; + int distortion; + unsigned int sse; + int cost_list[5]; + MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + // Save input state + struct buf_2d src = x->plane[0].src; + struct buf_2d pre = xd->plane[0].pre[0]; + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + +#if CONFIG_REF_MV + x->mvcost = x->mv_cost_stack[0]; + x->nmvjointcost = x->nmv_vec_cost[0]; + x->mvsadcost = x->mvcost; + x->nmvjointsadcost = x->nmvjointcost; +#endif + + // Ignore mv costing by sending NULL pointer instead of cost arrays + av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0, + &best_ref_mv1); + + x->mv_limits = tmp_mv_limits; + + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + 0); + + x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv; + + // Restore input state + x->plane[0].src = src; + xd->plane[0].pre[0] = pre; + + return bestsme; +} + +static void temporal_filter_iterate_c(AV1_COMP *cpi, + YV12_BUFFER_CONFIG **frames, + int frame_count, int alt_ref_index, + int strength, + struct scale_factors *scale) { + int byte; + int frame; + int mb_col, mb_row; + unsigned int filter_weight; + int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; + int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; + int mb_y_offset = 0; + int mb_uv_offset = 0; + DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); + MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; + uint8_t *dst1, *dst2; +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); +#endif + const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; + const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + + // Save input state + uint8_t *input_buffer[MAX_MB_PLANE]; + int i; +#if CONFIG_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + predictor = CONVERT_TO_BYTEPTR(predictor16); + } else { + predictor = predictor8; + } +#endif + + for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + + for (mb_row = 0; mb_row < mb_rows; mb_row++) { + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS) + // A 6/8 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the + // Y and therefore only extended by 8. The largest mv that a UV block + // can support is 8 - AOM_INTERP_EXTEND. A UV mv is half of a Y mv. + // (16 - AOM_INTERP_EXTEND) >> 1 which is greater than + // 8 - AOM_INTERP_EXTEND. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1). + cpi->td.mb.mv_limits.row_min = + -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND)); + cpi->td.mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND); + + for (mb_col = 0; mb_col < mb_cols; mb_col++) { + int j, k; + int stride; + + memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); + memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + + cpi->td.mb.mv_limits.col_min = + -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND)); + cpi->td.mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND); + + for (frame = 0; frame < frame_count; frame++) { + const int thresh_low = 10000; + const int thresh_high = 20000; + + if (frames[frame] == NULL) continue; + + mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; + mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; + + if (frame == alt_ref_index) { + filter_weight = 2; + } else { + // Find best match in this frame by MC + int err = temporal_filter_find_matching_mb_c( + cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); + + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + } + + if (filter_weight != 0) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, + mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, + mb_col * 16, mb_row * 16); + +#if CONFIG_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + av1_highbd_temporal_filter_apply( + f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, + adj_strength, filter_weight, accumulator, count); + av1_highbd_temporal_filter_apply( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + av1_highbd_temporal_filter_apply( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 512, count + 512); + } else { + // Apply the filter (YUV) + av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, + filter_weight, accumulator, count); + av1_temporal_filter_apply_c( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 256, count + 256); + av1_temporal_filter_apply_c( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, strength, filter_weight, + accumulator + 512, count + 512); + } +#else + // Apply the filter (YUV) + av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, + filter_weight, accumulator, count); + av1_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, + mb_uv_height, strength, filter_weight, + accumulator + 256, count + 256); + av1_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, + mb_uv_height, strength, filter_weight, + accumulator + 512, count + 512); +#endif // CONFIG_HIGHBITDEPTH + } + } + +#if CONFIG_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + dst1_16[byte] = + (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // move to next pixel + byte++; + } + + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + dst1_16[byte] = + (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // V + dst2_16[byte] = + (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + + // move to next pixel + byte++; + } + + byte += stride - mb_uv_width; + } + } else { + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // move to next pixel + byte++; + } + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // V + dst2[byte] = + (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } + } +#else + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // move to next pixel + byte++; + } + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + dst1[byte] = + (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); + + // V + dst2[byte] = + (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } +#endif // CONFIG_HIGHBITDEPTH + mb_y_offset += 16; + mb_uv_offset += mb_uv_width; + } + mb_y_offset += 16 * (f->y_stride - mb_cols); + mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; + } + + // Restore input state + for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; +} + +// Apply buffer limits and context specific adjustments to arnr filter. +static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, + int *arnr_frames, int *arnr_strength) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frames_after_arf = + av1_lookahead_depth(cpi->lookahead) - distance - 1; + int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd; + int q, frames, strength; + + // Define the forward and backwards filter limits for this arnr group. + if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; + if (frames_fwd > distance) frames_fwd = distance; + + frames_bwd = frames_fwd; + + // For even length filter there is one more frame backward + // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1; + + // Set the baseline active filter size. + frames = frames_bwd + 1 + frames_fwd; + + // Adjust the strength based on active max q. + if (cpi->common.current_video_frame > 1) + q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME], + cpi->common.bit_depth)); + else + q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME], + cpi->common.bit_depth)); + if (q > 16) { + strength = oxcf->arnr_strength; + } else { + strength = oxcf->arnr_strength - ((16 - q) / 2); + if (strength < 0) strength = 0; + } + + // Adjust number of frames in filter and strength based on gf boost level. + if (frames > group_boost / 150) { + frames = group_boost / 150; + frames += !(frames & 1); + } + + if (strength > group_boost / 300) { + strength = group_boost / 300; + } + + // Adjustments for second level arf in multi arf case. + if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { + strength >>= 1; + } + } + + *arnr_frames = frames; + *arnr_strength = strength; +} + +void av1_temporal_filter(AV1_COMP *cpi, int distance) { + RATE_CONTROL *const rc = &cpi->rc; + int frame; + int frames_to_blur; + int start_frame; + int strength; + int frames_to_blur_backward; + int frames_to_blur_forward; + struct scale_factors sf; + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; +#if CONFIG_EXT_REFS + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; +#endif + + // Apply context specific adjustments to the arnr filter parameters. + adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); +// TODO(weitinglin): Currently, we enforce the filtering strength on +// extra ARFs' to be zeros. We should investigate in which +// case it is more beneficial to use non-zero strength +// filtering. +#if CONFIG_EXT_REFS + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) { + strength = 0; + frames_to_blur = 1; + } +#endif + +#if CONFIG_EXT_REFS + if (strength == 0 && frames_to_blur == 1) { + cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1; + } else { + cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0; + } +#endif + + frames_to_blur_backward = (frames_to_blur / 2); + frames_to_blur_forward = ((frames_to_blur - 1) / 2); + start_frame = distance + frames_to_blur_forward; + + // Setup frame pointers, NULL indicates frame not included in filter. + for (frame = 0; frame < frames_to_blur; ++frame) { + const int which_buffer = start_frame - frame; + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; + } + + if (frames_to_blur > 0) { +// Setup scaling factors. Scaling on each of the arnr frames is not +// supported. +// ARF is produced at the native frame size and resized when coded. +#if CONFIG_HIGHBITDEPTH + av1_setup_scale_factors_for_frame( + &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + frames[0]->y_crop_width, frames[0]->y_crop_height, + cpi->common.use_highbitdepth); +#else + av1_setup_scale_factors_for_frame( + &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + frames[0]->y_crop_width, frames[0]->y_crop_height); +#endif // CONFIG_HIGHBITDEPTH + } + + temporal_filter_iterate_c(cpi, frames, frames_to_blur, + frames_to_blur_backward, strength, &sf); +} diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h new file mode 100644 index 000000000..bc0863a63 --- /dev/null +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_TEMPORAL_FILTER_H_ +#define AV1_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_temporal_filter(AV1_COMP *cpi, int distance); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_TEMPORAL_FILTER_H_ diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c new file mode 100644 index 000000000..f48493bf8 --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.c @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_mem/aom_mem.h" + +#include "av1/common/entropy.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { + { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 }, + { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 }, + { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 }, + { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 }, + { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 }, + { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 }, + { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 }, + { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 }, + { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 }, + { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, + { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 }, + { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 }, + { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 }, + { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 }, + { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 }, + { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 }, + { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 }, + { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 }, + { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 } +}; +const TOKENVALUE *av1_dct_cat_lt_10_value_tokens = + dct_cat_lt_10_value_tokens + + (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) / + 2; +// The corresponding costs of the extrabits for the tokens in the above table +// are stored in the table below. The values are obtained from looking up the +// entry for the specified extrabits in the table corresponding to the token +// (as defined in cost element av1_extra_bits) +// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1] +static const int dct_cat_lt_10_value_cost[] = { + 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282, + 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772, + 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742, + 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195, + 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512, + 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652, + 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476, + 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622, + 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136, + 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681, + 3704, 3750, 3773, +}; +const int *av1_dct_cat_lt_10_value_cost = + dct_cat_lt_10_value_cost + + (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2; + +// Array indices are identical to previously-existing CONTEXT_NODE indices +/* clang-format off */ +const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + -EOB_TOKEN, 2, // 0 = EOB + -ZERO_TOKEN, 4, // 1 = ZERO + -ONE_TOKEN, 6, // 2 = ONE + 8, 12, // 3 = LOW_VAL + -TWO_TOKEN, 10, // 4 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE + 14, 16, // 6 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE + 18, 20, // 8 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE +}; +/* clang-format on */ + +static const int16_t zero_cost[] = { 0 }; +static const int16_t sign_cost[1] = { 512 }; +static const int16_t cat1_cost[1 << 1] = { 864, 1229 }; +static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 }; +static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023, + 2195, 2334, 2427, 2566 }; +static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476, + 2534, 2615, 2661, 2742, 2800, 2881, + 2977, 3058, 3116, 3197 }; +static const int16_t cat5_cost[1 << 5] = { + 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, + 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363, + 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773 +}; +const int16_t av1_cat6_low_cost[256] = { + 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574, + 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822, + 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053, + 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301, + 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253, + 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461, + 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708, + 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, + 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, + 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, + 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, + 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455, + 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722, + 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945, + 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886, + 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094, + 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352, + 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609, + 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, + 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 +}; +const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES] = { + 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, + 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, + 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, + 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, + 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955, + 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, +#if CONFIG_HIGHBITDEPTH + 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, + 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, + 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, + 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, + 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, + 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356, + 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, + 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, + 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, + 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, + 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, + 17090, 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, + 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, + 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, + 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, + 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, + 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, + 23822, 22080, 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416, + 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771, + 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, + 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470, + 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, + 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, + 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, + 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, + 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, + 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, + 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, + 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, + 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, + 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, + 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, + 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, + 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, + 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, + 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, + 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, + 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, + 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, + 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, + 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832, + 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, + 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, + 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, + 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, + 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, + 17566, 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, + 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, + 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, + 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, + 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, + 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, + 22080, 24243, 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, + 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, + 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, + 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, + 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, + 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, + 24719, 26882, 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, + 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, + 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, + 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, + 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, + 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, + 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, + 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, + 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, + 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, + 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, + 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, + 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, + 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, + 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, + 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, + 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, + 25276, 25752, 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, + 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, + 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, + 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, + 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, + 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, + 27915, 26173, 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, + 22171, 24334, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, + 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, + 26688, 28851, 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, + 27912, 23628, 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, + 26267, 28430, 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, + 32429, 32905, 35068 +#endif +}; + +const uint8_t av1_cat6_skipped_bits_discount[8] = { + 0, 3, 6, 9, 12, 18, 24, 30 +}; + +#if CONFIG_NEW_MULTISYMBOL +const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { + { 0, 0, 0, zero_cost }, // ZERO_TOKEN + { 0, 0, 1, sign_cost }, // ONE_TOKEN + { 0, 0, 2, sign_cost }, // TWO_TOKEN + { 0, 0, 3, sign_cost }, // THREE_TOKEN + { 0, 0, 4, sign_cost }, // FOUR_TOKEN + { av1_cat1_cdf, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN + { av1_cat2_cdf, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN + { av1_cat3_cdf, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN + { av1_cat4_cdf, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN + { av1_cat5_cdf, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN + { av1_cat6_cdf, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN + { 0, 0, 0, zero_cost } // EOB_TOKEN +}; +#else +const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { + { 0, 0, 0, zero_cost }, // ZERO_TOKEN + { 0, 0, 1, sign_cost }, // ONE_TOKEN + { 0, 0, 2, sign_cost }, // TWO_TOKEN + { 0, 0, 3, sign_cost }, // THREE_TOKEN + { 0, 0, 4, sign_cost }, // FOUR_TOKEN + { av1_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN + { av1_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN + { av1_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN + { av1_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN + { av1_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN + { av1_cat6_prob, 18, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN + { 0, 0, 0, zero_cost } // EOB_TOKEN +}; +#endif + +#if !CONFIG_EC_MULTISYMBOL +const struct av1_token av1_coef_encodings[ENTROPY_TOKENS] = { + { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 }, + { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 } +}; +#endif // !CONFIG_EC_MULTISYMBOL + +struct tokenize_b_args { + const AV1_COMP *cpi; + ThreadData *td; + TOKENEXTRA **tp; + int this_rate; +}; + +#if !CONFIG_PVQ || CONFIG_VAR_TX +static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const PLANE_TYPE type = pd->plane_type; + const int ref = is_inter_block(mbmi); + const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, ref); + const int rate = av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, + pd->above_context + blk_col, + pd->left_context + blk_row, 0); + args->this_rate += rate; + (void)plane_bsize; + av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, + blk_row); +} + +static void set_entropy_context_b(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + (void)plane_bsize; + av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, + blk_row); +} + +#if CONFIG_NEW_TOKENSET +static INLINE void add_token(TOKENEXTRA **t, + aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)], + aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)], + int eob_val, int first_val, int32_t extra, + uint8_t token) { + (*t)->token = token; + (*t)->extra = extra; + (*t)->tail_cdf = tail_cdf; + (*t)->head_cdf = head_cdf; + (*t)->eob_val = eob_val; + (*t)->first_val = first_val; + (*t)++; +} + +#else // CONFIG_NEW_TOKENSET +static INLINE void add_token( + TOKENEXTRA **t, const aom_prob *context_tree, +#if CONFIG_EC_MULTISYMBOL + aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)], +#endif // CONFIG_EC_MULTISYMBOL + int32_t extra, uint8_t token, uint8_t skip_eob_node, unsigned int *counts) { + (*t)->token = token; + (*t)->extra = extra; + (*t)->context_tree = context_tree; +#if CONFIG_EC_MULTISYMBOL + (*t)->token_cdf = token_cdf; +#endif // CONFIG_EC_MULTISYMBOL + (*t)->skip_eob_node = skip_eob_node; + (*t)++; + ++counts[token]; +} +#endif // CONFIG_NEW_TOKENSET +#endif // !CONFIG_PVQ || CONFIG_VAR_TX + +#if CONFIG_PALETTE +void av1_tokenize_palette_sb(const AV1_COMP *cpi, + const struct ThreadData *const td, int plane, + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate) { + const MACROBLOCK *const x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const uint8_t *const color_map = xd->plane[plane].color_index_map; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int n = pmi->palette_size[plane]; + int i, j; + int this_rate = 0; + uint8_t color_order[PALETTE_MAX_SIZE]; + const aom_prob( + *const probs)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] = + plane == 0 ? av1_default_palette_y_color_index_prob + : av1_default_palette_uv_color_index_prob; + int plane_block_width, rows, cols; + av1_get_block_dimensions(bsize, plane, xd, &plane_block_width, NULL, &rows, + &cols); + assert(plane == 0 || plane == 1); + +#if CONFIG_PALETTE_THROUGHPUT + int k; + for (k = 1; k < rows + cols - 1; ++k) { + for (j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { + i = k - j; +#else + for (i = 0; i < rows; ++i) { + for (j = (i == 0 ? 1 : 0); j < cols; ++j) { +#endif // CONFIG_PALETTE_THROUGHPUT + int color_new_idx; + const int color_ctx = av1_get_palette_color_index_context( + color_map, plane_block_width, i, j, n, color_order, &color_new_idx); + assert(color_new_idx >= 0 && color_new_idx < n); + if (dry_run == DRY_RUN_COSTCOEFFS) + this_rate += cpi->palette_y_color_cost[n - PALETTE_MIN_SIZE][color_ctx] + [color_new_idx]; + (*t)->token = color_new_idx; + (*t)->context_tree = probs[n - PALETTE_MIN_SIZE][color_ctx]; + (*t)->skip_eob_node = 0; + ++(*t); + } + } + if (rate) *rate += this_rate; +} +#endif // CONFIG_PALETTE + +#if CONFIG_PVQ +static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x, + PVQ_INFO *pvq) { + PVQ_QUEUE *q = x->pvq_q; + if (q->curr_pos >= q->buf_len) { + int new_buf_len = 2 * q->buf_len + 1; + PVQ_INFO *new_buf; + CHECK_MEM_ERROR(cm, new_buf, aom_malloc(new_buf_len * sizeof(PVQ_INFO))); + memcpy(new_buf, q->buf, q->buf_len * sizeof(PVQ_INFO)); + aom_free(q->buf); + q->buf = new_buf; + q->buf_len = new_buf_len; + } + OD_COPY(q->buf + q->curr_pos, pvq, 1); + ++q->curr_pos; +} + +// NOTE: This does not actually generate tokens, instead we store the encoding +// decisions made for PVQ in a queue that we will read from when +// actually writing the bitstream in write_modes_b +static void tokenize_pvq(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + PVQ_INFO *pvq_info; + + (void)block; + (void)blk_row; + (void)blk_col; + (void)plane_bsize; + (void)tx_size; + + assert(block < MAX_PVQ_BLOCKS_IN_SB); + pvq_info = &x->pvq[block][plane]; + add_pvq_block((AV1_COMMON * const)cm, x, pvq_info); +} +#endif // CONFIG_PVQ + +static void tokenize_b(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { +#if !CONFIG_PVQ + struct tokenize_b_args *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TOKENEXTRA **tp = args->tp; + uint8_t token_cache[MAX_TX_SQUARE]; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int pt; /* near block/prev token context index */ + int c; + TOKENEXTRA *t = *tp; /* store tokens starting here */ + const int eob = p->eobs[block]; + const PLANE_TYPE type = pd->plane_type; + const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); +#if CONFIG_SUPERTX + const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx); +#else + const int segment_id = mbmi->segment_id; +#endif // CONFIG_SUEPRTX + const int16_t *scan, *nb; + const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const int ref = is_inter_block(mbmi); + unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = + td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref]; +#if !CONFIG_NEW_TOKENSET +#if CONFIG_SUBFRAME_PROB_UPDATE + const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx] + [txsize_sqr_map[tx_size]][type][ref]; +#else + aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref]; +#endif // CONFIG_SUBFRAME_PROB_UPDATE +#endif // !CONFIG_NEW_TOKENSET +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#elif CONFIG_EC_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = cpi->common.fc; +#endif +#if CONFIG_NEW_TOKENSET + aom_cdf_prob( + *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = + ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref]; + aom_cdf_prob( + *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = + ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref]; + unsigned int(*const blockz_count)[2] = + td->counts->blockz_count[txsize_sqr_map[tx_size]][type][ref]; + int eob_val; + int first_val = 1; +#else +#if CONFIG_EC_MULTISYMBOL + aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = + ec_ctx->coef_cdfs[txsize_sqr_map[tx_size]][type][ref]; +#endif + int skip_eob = 0; +#endif + const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); + unsigned int(*const eob_branch)[COEFF_CONTEXTS] = + td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref]; + const uint8_t *const band = get_band_translate(tx_size); + int16_t token; + EXTRABIT extra; + (void)plane_bsize; + pt = get_entropy_context(tx_size, pd->above_context + blk_col, + pd->left_context + blk_row); + scan = scan_order->scan; + nb = scan_order->neighbors; + c = 0; + +#if CONFIG_NEW_TOKENSET + if (eob == 0) + add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1, + 1, 0, BLOCK_Z_TOKEN); + + ++blockz_count[pt][eob != 0]; + + while (c < eob) { + int v = qcoeff[scan[c]]; + first_val = (c == 0); + + if (!v) { + add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], + 0, first_val, 0, ZERO_TOKEN); + ++counts[band[c]][pt][ZERO_TOKEN]; + token_cache[scan[c]] = 0; + } else { + eob_val = + (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB; + + av1_get_token_extra(v, &token, &extra); + + add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], + eob_val, first_val, extra, (uint8_t)token); + + if (eob_val != LAST_EOB) { + ++counts[band[c]][pt][token]; + ++eob_branch[band[c]][pt]; + counts[band[c]][pt][EOB_TOKEN] += eob_val != NO_EOB; + } + + token_cache[scan[c]] = av1_pt_energy_class[token]; + } + ++c; + pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1)); + } +#else + while (c < eob) { + const int v = qcoeff[scan[c]]; + eob_branch[band[c]][pt] += !skip_eob; + + av1_get_token_extra(v, &token, &extra); + + add_token(&t, coef_probs[band[c]][pt], +#if CONFIG_EC_MULTISYMBOL + &coef_cdfs[band[c]][pt], +#endif + extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]); + + token_cache[scan[c]] = av1_pt_energy_class[token]; + ++c; + pt = get_coef_context(nb, token_cache, c); + skip_eob = (token == ZERO_TOKEN); + } + if (c < seg_eob) { + add_token(&t, coef_probs[band[c]][pt], +#if CONFIG_EC_MULTISYMBOL + NULL, +#endif + 0, EOB_TOKEN, 0, counts[band[c]][pt]); + ++eob_branch[band[c]][pt]; + } +#endif // CONFIG_NEW_TOKENSET + +#if CONFIG_COEF_INTERLEAVE + t->token = EOSB_TOKEN; + t++; +#endif + + *tp = t; + +#if CONFIG_ADAPT_SCAN + // Since dqcoeff is not available here, we pass qcoeff into + // av1_update_scan_count_facade(). The update behavior should be the same + // because av1_update_scan_count_facade() only cares if coefficients are zero + // or not. + av1_update_scan_count_facade((AV1_COMMON *)cm, td->counts, tx_size, tx_type, + qcoeff, c); +#endif + + av1_set_contexts(xd, pd, plane, tx_size, c > 0, blk_col, blk_row); +#else // !CONFIG_PVQ + tokenize_pvq(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); +#endif // !CONFIG_PVQ +} + +struct is_skippable_args { + uint16_t *eobs; + int *skippable; +}; +static void is_skippable(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { + struct is_skippable_args *args = argv; + (void)plane; + (void)plane_bsize; + (void)tx_size; + (void)blk_row; + (void)blk_col; + args->skippable[0] &= (!args->eobs[block]); +} + +// TODO(yaowu): rewrite and optimize this function to remove the usage of +// av1_foreach_transform_block() and simplify is_skippable(). +int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + int result = 1; + struct is_skippable_args args = { x->plane[plane].eobs, &result }; + av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, + &args); + return result; +} + +#if CONFIG_VAR_TX +void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, + TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, int block, int plane, void *arg) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + TX_SIZE plane_tx_size; + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + plane_tx_size = + plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] + : mbmi->inter_tx_size[tx_row][tx_col]; + + if (tx_size == plane_tx_size) { + plane_bsize = get_plane_block_size(mbmi->sb_type, pd); + if (!dry_run) + tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + else if (dry_run == DRY_RUN_NORMAL) + set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize, + tx_size, arg); + else if (dry_run == DRY_RUN_COSTCOEFFS) + cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + } else { + // Half the block size in transform block unit. + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int i; + + assert(bsl > 0); + + for (i = 0; i < 4; ++i) { + const int offsetr = blk_row + ((i >> 1) * bsl); + const int offsetc = blk_col + ((i & 0x01) * bsl); + + int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc, + block, plane, arg); + block += step; + } + } +} + +void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, + RUN_TYPE dry_run, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + TOKENEXTRA *t_backup = *t; + const int ctx = av1_get_skip_context(xd); + const int skip_inc = + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + struct tokenize_b_args arg = { cpi, td, t, 0 }; + int plane; + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (mbmi->skip) { + if (!dry_run) td->counts->skip[ctx][1] += skip_inc; + reset_skip_context(xd, bsize); + if (dry_run) *t = t_backup; + return; + } + + if (!dry_run) + td->counts->skip[ctx][0] += skip_inc; + else + *t = t_backup; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) { +#if !CONFIG_PVQ + if (!dry_run) { + (*t)->token = EOSB_TOKEN; + (*t)++; + } +#endif + continue; + } +#endif + const struct macroblockd_plane *const pd = &xd->plane[plane]; +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + const BLOCK_SIZE plane_bsize = + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#else + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#endif + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0]; + int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; + int idx, idy; + int block = 0; + int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + for (idy = 0; idy < mi_height; idy += bh) { + for (idx = 0; idx < mi_width; idx += bw) { + tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx, + block, plane, &arg); + block += step; + } + } + + if (!dry_run) { + (*t)->token = EOSB_TOKEN; + (*t)++; + } + } + if (rate) *rate += arg.this_rate; +} +#endif // CONFIG_VAR_TX + +void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + const int mi_row, const int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int ctx = av1_get_skip_context(xd); + const int skip_inc = + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + struct tokenize_b_args arg = { cpi, td, t, 0 }; + if (mbmi->skip) { + if (!dry_run) td->counts->skip[ctx][1] += skip_inc; + reset_skip_context(xd, bsize); + return; + } + + if (!dry_run) { +#if CONFIG_COEF_INTERLEAVE + td->counts->skip[ctx][0] += skip_inc; + av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg); +#else + int plane; + + td->counts->skip[ctx][0] += skip_inc; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) { +#if !CONFIG_PVQ + (*t)->token = EOSB_TOKEN; + (*t)++; +#endif + continue; + } +#else + (void)mi_row; + (void)mi_col; +#endif + av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, + &arg); +#if !CONFIG_PVQ + (*t)->token = EOSB_TOKEN; + (*t)++; +#endif // !CONFIG_PVQ + } +#endif + } +#if !CONFIG_PVQ + else if (dry_run == DRY_RUN_NORMAL) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) + continue; +#else + (void)mi_row; + (void)mi_col; +#endif + av1_foreach_transformed_block_in_plane(xd, bsize, plane, + set_entropy_context_b, &arg); + } + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, bsize, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y)) + continue; +#else + (void)mi_row; + (void)mi_col; +#endif + av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, + &arg); + } + } +#endif // !CONFIG_PVQ + + if (rate) *rate += arg.this_rate; +} + +#if CONFIG_SUPERTX +void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &td->mb.e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + TOKENEXTRA *t_backup = *t; + const int ctx = av1_get_skip_context(xd); + const int skip_inc = + !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP); + struct tokenize_b_args arg = { cpi, td, t, 0 }; + if (mbmi->skip) { + if (!dry_run) td->counts->skip[ctx][1] += skip_inc; + reset_skip_context(xd, bsize); + if (dry_run) *t = t_backup; + return; + } + + if (!dry_run) { + int plane; + td->counts->skip[ctx][0] += skip_inc; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b, + &arg); + (*t)->token = EOSB_TOKEN; + (*t)++; + } + } else if (dry_run == DRY_RUN_NORMAL) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + av1_foreach_transformed_block_in_plane(xd, bsize, plane, + set_entropy_context_b, &arg); + *t = t_backup; + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + av1_foreach_transformed_block_in_plane(xd, bsize, plane, cost_coeffs_b, + &arg); + } + if (rate) *rate += arg.this_rate; +} +#endif // CONFIG_SUPERTX diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h new file mode 100644 index 000000000..3928111d6 --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_TOKENIZE_H_ +#define AV1_ENCODER_TOKENIZE_H_ + +#include "av1/common/entropy.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/treewriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOSB_TOKEN 127 // Not signalled, encoder only + +#if CONFIG_HIGHBITDEPTH +typedef int32_t EXTRABIT; +#else +typedef int16_t EXTRABIT; +#endif + +typedef struct { + int16_t token; + EXTRABIT extra; +} TOKENVALUE; + +typedef struct { +#if CONFIG_NEW_TOKENSET + aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; + aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; + int eob_val; + int first_val; +#elif CONFIG_EC_MULTISYMBOL + aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; +#endif + const aom_prob *context_tree; + EXTRABIT extra; + uint8_t token; + uint8_t skip_eob_node; +} TOKENEXTRA; + +extern const aom_tree_index av1_coef_tree[]; +extern const aom_tree_index av1_coef_con_tree[]; +#if !CONFIG_EC_MULTISYMBOL +extern const struct av1_token av1_coef_encodings[]; +#endif // !CONFIG_EC_MULTISYMBOL + +int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); + +struct AV1_COMP; +struct ThreadData; + +typedef enum { + OUTPUT_ENABLED = 0, + DRY_RUN_NORMAL, + DRY_RUN_COSTCOEFFS, +} RUN_TYPE; + +// Note in all the tokenize functions rate if non NULL is incremented +// with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS, +// otherwise rate is not incremented. +#if CONFIG_VAR_TX +void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate); +#endif +#if CONFIG_PALETTE +void av1_tokenize_palette_sb(const struct AV1_COMP *cpi, + const struct ThreadData *const td, int plane, + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate); +#endif // CONFIG_PALETTE +void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate, const int mi_row, const int mi_col); +#if CONFIG_SUPERTX +void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td, + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate); +#endif + +extern const int16_t *av1_dct_value_cost_ptr; +/* TODO: The Token field should be broken out into a separate char array to + * improve cache locality, since it's needed for costing when the rest of the + * fields are not. + */ +extern const TOKENVALUE *av1_dct_value_tokens_ptr; +extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens; +extern const int *av1_dct_cat_lt_10_value_cost; +extern const int16_t av1_cat6_low_cost[256]; +#if CONFIG_HIGHBITDEPTH +#define CAT6_HIGH_COST_ENTRIES 1024 +#else +#define CAT6_HIGH_COST_ENTRIES 64 +#endif +extern const int av1_cat6_high_cost[CAT6_HIGH_COST_ENTRIES]; +extern const uint8_t av1_cat6_skipped_bits_discount[8]; + +static INLINE void av1_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { + *token = CATEGORY6_TOKEN; + if (v >= CAT6_MIN_VAL) + *extra = 2 * v - 2 * CAT6_MIN_VAL; + else + *extra = -2 * v - 2 * CAT6_MIN_VAL + 1; + return; + } + *token = av1_dct_cat_lt_10_value_tokens[v].token; + *extra = av1_dct_cat_lt_10_value_tokens[v].extra; +} +static INLINE int16_t av1_get_token(int v) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10; + return av1_dct_cat_lt_10_value_tokens[v].token; +} + +static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) { + if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { + EXTRABIT extrabits; + *token = CATEGORY6_TOKEN; + extrabits = abs(v) - CAT6_MIN_VAL; + return av1_cat6_low_cost[extrabits & 0xff] + + av1_cat6_high_cost[extrabits >> 8] - + av1_cat6_skipped_bits_discount[18 - cat6_bits]; + } + *token = av1_dct_cat_lt_10_value_tokens[v].token; + return av1_dct_cat_lt_10_value_cost[v]; +} + +#if !CONFIG_PVQ || CONFIG_VAR_TX +static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = tx_size_2d[tx_size]; + return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_TOKENIZE_H_ diff --git a/third_party/aom/av1/encoder/treewriter.c b/third_party/aom/av1/encoder/treewriter.c new file mode 100644 index 000000000..50be72413 --- /dev/null +++ b/third_party/aom/av1/encoder/treewriter.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/treewriter.h" + +static void tree2tok(struct av1_token *tokens, const aom_tree_index *tree, + int i, int v, int l) { + v += v; + ++l; + + do { + const aom_tree_index j = tree[i++]; + if (j <= 0) { + tokens[-j].value = v; + tokens[-j].len = l; + } else { + tree2tok(tokens, tree, j, v, l); + } + } while (++v & 1); +} + +void av1_tokens_from_tree(struct av1_token *tokens, + const aom_tree_index *tree) { + tree2tok(tokens, tree, 0, 0, 0); +} + +static unsigned int convert_distribution(unsigned int i, aom_tree tree, + unsigned int branch_ct[][2], + const unsigned int num_events[]) { + unsigned int left, right; + + if (tree[i] <= 0) + left = num_events[-tree[i]]; + else + left = convert_distribution(tree[i], tree, branch_ct, num_events); + + if (tree[i + 1] <= 0) + right = num_events[-tree[i + 1]]; + else + right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); + + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; + return left + right; +} + +void av1_tree_probs_from_distribution(aom_tree tree, + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { + convert_distribution(0, tree, branch_ct, num_events); +} diff --git a/third_party/aom/av1/encoder/treewriter.h b/third_party/aom/av1/encoder/treewriter.h new file mode 100644 index 000000000..9a4cb86cb --- /dev/null +++ b/third_party/aom/av1/encoder/treewriter.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_TREEWRITER_H_ +#define AV1_ENCODER_TREEWRITER_H_ + +#include "aom_dsp/bitwriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_tree_probs_from_distribution(aom_tree tree, + unsigned int branch_ct[/* n - 1 */][2], + const unsigned int num_events[/* n */]); + +struct av1_token { + int value; + int len; +}; + +void av1_tokens_from_tree(struct av1_token *, const aom_tree_index *); + +static INLINE void av1_write_token(aom_writer *w, const aom_tree_index *tree, + const aom_prob *probs, + const struct av1_token *token) { + aom_write_tree(w, tree, probs, token->value, token->len, 0); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_TREEWRITER_H_ diff --git a/third_party/aom/av1/encoder/variance_tree.c b/third_party/aom/av1/encoder/variance_tree.c new file mode 100644 index 000000000..9384cd78e --- /dev/null +++ b/third_party/aom/av1/encoder/variance_tree.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/variance_tree.h" +#include "av1/encoder/encoder.h" + +void av1_setup_var_tree(struct AV1Common *cm, ThreadData *td) { + int i, j; +#if CONFIG_EXT_PARTITION + const int leaf_nodes = 1024; + const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1; +#else + const int leaf_nodes = 256; + const int tree_nodes = 256 + 64 + 16 + 4 + 1; +#endif // CONFIG_EXT_PARTITION + int index = 0; + VAR_TREE *this_var; + int nodes; + + aom_free(td->var_tree); + CHECK_MEM_ERROR(cm, td->var_tree, + aom_calloc(tree_nodes, sizeof(*td->var_tree))); + + this_var = &td->var_tree[0]; + + // Sets up all the leaf nodes in the tree. + for (index = 0; index < leaf_nodes; ++index) { + VAR_TREE *const leaf = &td->var_tree[index]; + leaf->split[0] = NULL; + } + + // Each node has 4 leaf nodes, fill in the child pointers + // from leafs to the root. + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i, ++index) { + VAR_TREE *const node = &td->var_tree[index]; + for (j = 0; j < 4; j++) node->split[j] = this_var++; + } + } + + // Set up the root node for the largest superblock size + i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2; + td->var_root[i] = &td->var_tree[tree_nodes - 1]; + // Set up the root nodes for the rest of the possible superblock sizes + while (--i >= 0) { + td->var_root[i] = td->var_root[i + 1]->split[0]; + } +} + +void av1_free_var_tree(ThreadData *td) { + aom_free(td->var_tree); + td->var_tree = NULL; +} diff --git a/third_party/aom/av1/encoder/variance_tree.h b/third_party/aom/av1/encoder/variance_tree.h new file mode 100644 index 000000000..a9f27302e --- /dev/null +++ b/third_party/aom/av1/encoder/variance_tree.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_VARIANCE_TREE_H_ +#define AV1_ENCODER_VARIANCE_TREE_H_ + +#include + +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct ThreadData; + +typedef struct { + int64_t sum_square_error; + int64_t sum_error; + int log2_count; + int variance; +} VAR; + +typedef struct { + VAR none; + VAR horz[2]; + VAR vert[2]; +} partition_variance; + +typedef struct VAR_TREE { + int force_split; + partition_variance variances; + struct VAR_TREE *split[4]; + BLOCK_SIZE bsize; + const uint8_t *src; + const uint8_t *ref; + int src_stride; + int ref_stride; + int width; + int height; +#if CONFIG_HIGHBITDEPTH + int highbd; +#endif // CONFIG_HIGHBITDEPTH +} VAR_TREE; + +void av1_setup_var_tree(struct AV1Common *cm, struct ThreadData *td); +void av1_free_var_tree(struct ThreadData *td); + +// Set variance values given sum square error, sum error, count. +static INLINE void fill_variance(int64_t s2, int64_t s, int c, VAR *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; + v->variance = + (int)(256 * (v->sum_square_error - + ((v->sum_error * v->sum_error) >> v->log2_count)) >> + v->log2_count); +} + +static INLINE void sum_2_variances(const VAR *a, const VAR *b, VAR *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static INLINE void fill_variance_node(VAR_TREE *vt) { + sum_2_variances(&vt->split[0]->variances.none, &vt->split[1]->variances.none, + &vt->variances.horz[0]); + sum_2_variances(&vt->split[2]->variances.none, &vt->split[3]->variances.none, + &vt->variances.horz[1]); + sum_2_variances(&vt->split[0]->variances.none, &vt->split[2]->variances.none, + &vt->variances.vert[0]); + sum_2_variances(&vt->split[1]->variances.none, &vt->split[3]->variances.none, + &vt->variances.vert[1]); + sum_2_variances(&vt->variances.vert[0], &vt->variances.vert[1], + &vt->variances.none); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* AV1_ENCODER_VARIANCE_TREE_H_ */ diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c new file mode 100644 index 000000000..e6edbb6af --- /dev/null +++ b/third_party/aom/av1/encoder/wedge_utils.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "aom_dsp/aom_dsp_common.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * Computes SSE of a compound predictor constructed from 2 fundamental + * predictors p0 and p1 using blending with mask. + * + * r1: Residuals of p1. + * (source - p1) + * d: Difference of p1 and p0. + * (p1 - p0) + * m: The blending mask + * N: Number of pixels + * + * 'r1', 'd', and 'm' are contiguous. + * + * Computes: + * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2), + * where r0 is (source - p0), and r1 is (source - p1), which is in turn + * is equivalent to: + * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2), + * which is the SSE of the residuals of the compound predictor scaled up by + * MAX_MASK_VALUE**2. + * + * Note that we clamp the partial term in the loop to 16 bits signed. This is + * to facilitate equivalent SIMD implementation. It should have no effect if + * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always + * holds for 8 bit input, and on real input, it should hold practically always, + * as residuals are expected to be small. + */ +uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + uint64_t csse = 0; + int i; + + for (i = 0; i < N; i++) { + int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]; + t = clamp(t, INT16_MIN, INT16_MAX); + csse += t * t; + } + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * Choose the mask sign for a compound predictor. + * + * ds: Difference of the squares of the residuals. + * r0**2 - r1**2 + * m: The blending mask + * N: Number of pixels + * limit: Pre-computed threshold value. + * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * 'ds' and 'm' are contiguous. + * + * Returns true if the negated mask has lower SSE compared to the positive + * mask. Computation is based on: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) + * > + * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) + * + * which can be simplified to: + * + * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * The right hand side does not depend on the mask, and needs to be passed as + * the 'limit' parameter. + * + * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left + * hand side is simply a scalar product between an int16_t and uint8_t vector. + * + * Note that for efficiency, ds is stored on 16 bits. Real input residuals + * being small, this should not cause a noticeable issue. + */ +int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N, + int64_t limit) { + int64_t acc = 0; + + do { + acc += *ds++ * *m++; + } while (--N); + + return acc > limit; +} + +/** + * Compute the element-wise difference of the squares of 2 arrays. + * + * d: Difference of the squares of the inputs: a**2 - b**2 + * a: First input array + * b: Second input array + * N: Number of elements + * + * 'd', 'a', and 'b' are contiguous. + * + * The result is saturated to signed 16 bits. + */ +void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + int i; + + for (i = 0; i < N; i++) + d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); +} diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c new file mode 100644 index 000000000..fa5626002 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./av1_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" + +// Coefficient quantization phase 1 +// param[0-2] : rounding/quan/dequan constants +static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, + const int shift, const int scale, + __m128i *qcoeff, __m128i *dquan, + __m128i *sign) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + + *sign = _mm_cmplt_epi32(*coeff, zero); + *sign = _mm_or_si128(*sign, one); + *coeff = _mm_abs_epi32(*coeff); + + qcoeff[0] = _mm_add_epi32(*coeff, param[0]); + qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero); + qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero); + + qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); + qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); + dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); + dquan[0] = _mm_srli_epi64(dquan[0], scale); +} + +// Coefficient quantization phase 2 +static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, + const __m128i *sign, + const __m128i *param, const int shift, + const int scale, tran_low_t *qAddr, + tran_low_t *dqAddr) { + __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); + __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); + + qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); + qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); + dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); + dquan[1] = _mm_srli_epi64(dquan[1], scale); + + // combine L&H + qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); + qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); + + qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H); + qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L); + + dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8); + dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d); + + dquan[0] = _mm_and_si128(dquan[0], mask0H); + dquan[1] = _mm_and_si128(dquan[1], mask0L); + + qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); + dquan[0] = _mm_or_si128(dquan[0], dquan[1]); + + qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); + dquan[0] = _mm_sign_epi32(dquan[0], *sign); + + _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); + _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); +} + +static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, + __m128i *eob) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, iscanIdx; + const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); + const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); + __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); + __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); + + nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); + nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); + + mask = _mm_packs_epi32(nz_flag0, nz_flag1); + iscanIdx = _mm_loadu_si128((__m128i const *)iscan); + iscanIdx = _mm_sub_epi16(iscanIdx, mask); + iscanIdx = _mm_and_si128(iscanIdx, mask); + *eob = _mm_max_epi16(*eob, iscanIdx); +} + +static INLINE uint16_t get_accumulated_eob(__m128i *eob) { + __m128i eob_shuffled; + uint16_t eobValue; + eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eobValue = _mm_extract_epi16(*eob, 0); + return eobValue; +} + +void av1_highbd_quantize_fp_sse4_1( + const tran_low_t *coeff_ptr, intptr_t count, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign; + __m128i eob = _mm_setzero_si128(); + const tran_low_t *src = coeff_ptr; + tran_low_t *quanAddr = qcoeff_ptr; + tran_low_t *dquanAddr = dqcoeff_ptr; + const int shift = 16 - log_scale; + const int coeff_stride = 4; + const int quan_stride = coeff_stride; + (void)skip_block; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + memset(quanAddr, 0, count * sizeof(quanAddr[0])); + memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); + + if (!skip_block) { + coeff[0] = _mm_loadu_si128((__m128i const *)src); + + qparam[0] = + _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]); + qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]); + qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]); + qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]); + + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr + quan_stride, + dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, + dequant, &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr, dquanAddr); + + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, + dequant, &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr + quan_stride, + dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + } + *eob_ptr = get_accumulated_eob(&eob); + } else { + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c new file mode 100644 index 000000000..f9c95b6cb --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./av1_rtcd.h" +#include "aom/aom_integer.h" + +void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i zero; + __m128i thr; + int16_t nzflag; + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + + if (!skip_block) { + __m128i eob; + __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values + { + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } else { + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + } + } + + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..ad4ae274e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + lea coeffq, [ coeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; r4[i] = r3[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%else + psrlw m0, m3, 1 +%endif + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + + or r6, r2 + jz .skip_iter + + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; r4[i] = r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + + jmp .accumulate_eob +.skip_iter: + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [r3q], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm new file mode 100644 index 000000000..dcc697ba3 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -0,0 +1,219 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(av1_ssim_parms_16x16_sse2) PRIVATE +sym(av1_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(av1_ssim_parms_8x8_sse2) PRIVATE +sym(av1_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c new file mode 100644 index 000000000..37c4b0d88 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c @@ -0,0 +1,3884 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i mask; + + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + } + + in[0] = _mm_slli_epi16(in[0], 4); + in[1] = _mm_slli_epi16(in[1], 4); + in[2] = _mm_slli_epi16(in[2], 4); + in[3] = _mm_slli_epi16(in[3], 4); + + mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); + in[0] = _mm_add_epi16(in[0], mask); + in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) { + const __m128i kOne = _mm_set1_epi16(1); + __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); + __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); + __m128i out01 = _mm_add_epi16(in01, kOne); + __m128i out23 = _mm_add_epi16(in23, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + store_output(&out01, (output + 0 * 8)); + store_output(&out23, (output + 1 * 8)); +} + +static INLINE void transpose_4x4(__m128i *res) { + // Combine and transpose + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + // 00 10 20 30 01 11 21 31 + // 02 12 22 32 03 13 23 33 + // only use the first 4 16-bit integers + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +static void fdct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u[4], v[4]; + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[3], in[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 + u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 + u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 + u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + transpose_4x4(in); +} + +static void fadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); + const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); + const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); + const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + __m128i in7 = _mm_add_epi16(in[0], in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[2], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + transpose_4x4(in); +} + +#if CONFIG_EXT_TX +static void fidtx4_sse2(__m128i *in) { + const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); + const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i v0, v1, v2, v3; + __m128i u0, u1, u2, u3; + + v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); + v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); + v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); + v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); + + u0 = _mm_madd_epi16(v0, k__sqrt2_epi16); + u1 = _mm_madd_epi16(v1, k__sqrt2_epi16); + u2 = _mm_madd_epi16(v2, k__sqrt2_epi16); + u3 = _mm_madd_epi16(v3, k__sqrt2_epi16); + + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u2); + in[1] = _mm_packs_epi32(u1, u3); + transpose_4x4(in); +} +#endif // CONFIG_EXT_TX + +void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[4]; + + switch (tx_type) { + case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride, 0, 0); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case ADST_ADST: + load_buffer_4x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_4x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, stride, 0, 1); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, stride, 1, 1); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case IDTX: + load_buffer_4x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in); + write_buffer_4x4(output, in); + break; + case V_DCT: + load_buffer_4x4(input, in, stride, 0, 0); + fdct4_sse2(in); + fidtx4_sse2(in); + write_buffer_4x4(output, in); + break; + case H_DCT: + load_buffer_4x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); + break; + case V_ADST: + load_buffer_4x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fidtx4_sse2(in); + write_buffer_4x4(output, in); + break; + case H_ADST: + load_buffer_4x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; + case V_FLIPADST: + load_buffer_4x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fidtx4_sse2(in); + write_buffer_4x4(output, in); + break; + case H_FLIPADST: + load_buffer_4x4(input, in, stride, 0, 1); + fidtx4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +void av1_fdct8x8_quant_sse2(const int16_t *input, int stride, + int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i zero; + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i *in[8]; + int index = 0; + + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)coeff_ptr; + + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + in[0] = &in0; + in[1] = &in1; + in[2] = &in2; + in[3] = &in3; + in[4] = &in4; + in[5] = &in5; + in[6] = &in6; + in[7] = &in7; + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = _mm_add_epi16(in0, in7); + const __m128i q1 = _mm_add_epi16(in1, in6); + const __m128i q2 = _mm_add_epi16(in2, in5); + const __m128i q3 = _mm_add_epi16(in3, in4); + const __m128i q4 = _mm_sub_epi16(in3, in4); + const __m128i q5 = _mm_sub_epi16(in2, in5); + const __m128i q6 = _mm_sub_epi16(in1, in6); + const __m128i q7 = _mm_sub_epi16(in0, in7); + // Work on first four results + { + // Add/subtract + const __m128i r0 = _mm_add_epi16(q0, q3); + const __m128i r1 = _mm_add_epi16(q1, q2); + const __m128i r2 = _mm_sub_epi16(q1, q2); + const __m128i r3 = _mm_sub_epi16(q0, q3); + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); + // Add/subtract + const __m128i x0 = _mm_add_epi16(q4, r0); + const __m128i x1 = _mm_sub_epi16(q4, r0); + const __m128i x2 = _mm_sub_epi16(q7, r1); + const __m128i x3 = _mm_add_epi16(q7, r1); + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + } + + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + + if (!skip_block) { + __m128i eob; + __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values + { + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = *in[0]; + coeff1 = *in[1]; + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + // AC only loop + index = 2; + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); + coeff0 = *in[index]; + coeff1 = *in[index + 1]; + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + index += 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} + +// load 8x8 array +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + in[0] = _mm_slli_epi16(in[0], 2); + in[1] = _mm_slli_epi16(in[1], 2); + in[2] = _mm_slli_epi16(in[2], 2); + in[3] = _mm_slli_epi16(in[3], 2); + in[4] = _mm_slli_epi16(in[4], 2); + in[5] = _mm_slli_epi16(in[5], 2); + in[6] = _mm_slli_epi16(in[6], 2); + in[7] = _mm_slli_epi16(in[7], 2); +} + +// right shift and rounding +static INLINE void right_shift_8x8(__m128i *res, const int bit) { + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + if (bit == 2) { + const __m128i const_rounding = _mm_set1_epi16(1); + res[0] = _mm_adds_epi16(res[0], const_rounding); + res[1] = _mm_adds_epi16(res[1], const_rounding); + res[2] = _mm_adds_epi16(res[2], const_rounding); + res[3] = _mm_adds_epi16(res[3], const_rounding); + res[4] = _mm_adds_epi16(res[4], const_rounding); + res[5] = _mm_adds_epi16(res[5], const_rounding); + res[6] = _mm_adds_epi16(res[6], const_rounding); + res[7] = _mm_adds_epi16(res[7], const_rounding); + } + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = _mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + + if (bit == 1) { + res[0] = _mm_srai_epi16(res[0], 1); + res[1] = _mm_srai_epi16(res[1], 1); + res[2] = _mm_srai_epi16(res[2], 1); + res[3] = _mm_srai_epi16(res[3], 1); + res[4] = _mm_srai_epi16(res[4], 1); + res[5] = _mm_srai_epi16(res[5], 1); + res[6] = _mm_srai_epi16(res[6], 1); + res[7] = _mm_srai_epi16(res[7], 1); + } else { + res[0] = _mm_srai_epi16(res[0], 2); + res[1] = _mm_srai_epi16(res[1], 2); + res[2] = _mm_srai_epi16(res[2], 2); + res[3] = _mm_srai_epi16(res[3], 2); + res[4] = _mm_srai_epi16(res[4], 2); + res[5] = _mm_srai_epi16(res[5], 2); + res[6] = _mm_srai_epi16(res[6], 2); + res[7] = _mm_srai_epi16(res[7], 2); + } +} + +// write 8x8 array +static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, + int stride) { + store_output(&res[0], (output + 0 * stride)); + store_output(&res[1], (output + 1 * stride)); + store_output(&res[2], (output + 2 * stride)); + store_output(&res[3], (output + 3 * stride)); + store_output(&res[4], (output + 4 * stride)); + store_output(&res[5], (output + 5 * stride)); + store_output(&res[6], (output + 6 * stride)); + store_output(&res[7], (output + 7 * stride)); +} + +// perform in-place transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 +} + +static void fdct8_sse2(__m128i *in) { + // constants + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 1 + s0 = _mm_add_epi16(in[0], in[7]); + s1 = _mm_add_epi16(in[1], in[6]); + s2 = _mm_add_epi16(in[2], in[5]); + s3 = _mm_add_epi16(in[3], in[4]); + s4 = _mm_sub_epi16(in[3], in[4]); + s5 = _mm_sub_epi16(in[2], in[5]); + s6 = _mm_sub_epi16(in[1], in[6]); + s7 = _mm_sub_epi16(in[0], in[7]); + + u0 = _mm_add_epi16(s0, s3); + u1 = _mm_add_epi16(s1, s2); + u2 = _mm_sub_epi16(s1, s2); + u3 = _mm_sub_epi16(s0, s3); + // interleave and perform butterfly multiplication/addition + v0 = _mm_unpacklo_epi16(u0, u1); + v1 = _mm_unpackhi_epi16(u0, u1); + v2 = _mm_unpacklo_epi16(u2, u3); + v3 = _mm_unpackhi_epi16(u2, u3); + + u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); + u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); + u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); + u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); + u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); + u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); + + // shift and rounding + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u1); + in[2] = _mm_packs_epi32(u4, u5); + in[4] = _mm_packs_epi32(u2, u3); + in[6] = _mm_packs_epi32(u6, u7); + + // stage 2 + // interleave and perform butterfly multiplication/addition + u0 = _mm_unpacklo_epi16(s6, s5); + u1 = _mm_unpackhi_epi16(s6, s5); + v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + + u0 = _mm_packs_epi32(v0, v1); + u1 = _mm_packs_epi32(v2, v3); + + // stage 3 + s0 = _mm_add_epi16(s4, u0); + s1 = _mm_sub_epi16(s4, u0); + s2 = _mm_sub_epi16(s7, u1); + s3 = _mm_add_epi16(s7, u1); + + // stage 4 + u0 = _mm_unpacklo_epi16(s0, s3); + u1 = _mm_unpackhi_epi16(s0, s3); + u2 = _mm_unpacklo_epi16(s1, s2); + u3 = _mm_unpackhi_epi16(s1, s2); + + v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); + v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); + v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); + v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); + v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); + v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); + v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); + v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v0, v1); + in[3] = _mm_packs_epi32(v4, v5); + in[5] = _mm_packs_epi32(v2, v3); + in[7] = _mm_packs_epi32(v6, v7); + + // transpose + array_transpose_8x8(in, in); +} + +static void fadst8_sse2(__m128i *in) { + // Constants + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + v0 = _mm_add_epi32(w0, w4); + v1 = _mm_add_epi32(w1, w5); + v2 = _mm_add_epi32(w2, w6); + v3 = _mm_add_epi32(w3, w7); + v4 = _mm_sub_epi32(w0, w4); + v5 = _mm_sub_epi32(w1, w5); + v6 = _mm_sub_epi32(w2, w6); + v7 = _mm_sub_epi32(w3, w7); + + w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(w0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(w1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(w2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(w3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(w4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(w5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(w6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(w7, DCT_CONST_BITS); + + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_packs_epi32(v0, v1); + s1 = _mm_packs_epi32(v2, v3); + s2 = _mm_packs_epi32(v4, v5); + s3 = _mm_packs_epi32(v6, v7); + + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + // FIXME(jingning): do subtract using bit inversion? + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); + + // transpose + array_transpose_8x8(in, in); +} + +#if CONFIG_EXT_TX +static void fidtx8_sse2(__m128i *in) { + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); + + array_transpose_8x8(in, in); +} +#endif // CONFIG_EXT_TX + +void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[8]; + + switch (tx_type) { + case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride, 0, 0); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case ADST_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case IDTX: + load_buffer_8x8(input, in, stride, 0, 0); + fidtx8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0); + fdct8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0); + fidtx8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fadst8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0); + fidtx8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0); + fadst8_sse2(in); + fidtx8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1); + fidtx8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0, + __m128i *in1, int stride, int flipud, + int fliplr) { + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, in0, stride, flipud, fliplr); + load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr); + + // load second 8 columns + load_buffer_8x8(topR, in1, stride, flipud, fliplr); + load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, + __m128i *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(__m128i *in) { + // perform 16x16 1-D DCT for 8 columns + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = _mm_add_epi16(in[0], in[15]); + i[1] = _mm_add_epi16(in[1], in[14]); + i[2] = _mm_add_epi16(in[2], in[13]); + i[3] = _mm_add_epi16(in[3], in[12]); + i[4] = _mm_add_epi16(in[4], in[11]); + i[5] = _mm_add_epi16(in[5], in[10]); + i[6] = _mm_add_epi16(in[6], in[9]); + i[7] = _mm_add_epi16(in[7], in[8]); + + s[0] = _mm_sub_epi16(in[7], in[8]); + s[1] = _mm_sub_epi16(in[6], in[9]); + s[2] = _mm_sub_epi16(in[5], in[10]); + s[3] = _mm_sub_epi16(in[4], in[11]); + s[4] = _mm_sub_epi16(in[3], in[12]); + s[5] = _mm_sub_epi16(in[2], in[13]); + s[6] = _mm_sub_epi16(in[1], in[14]); + s[7] = _mm_sub_epi16(in[0], in[15]); + + p[0] = _mm_add_epi16(i[0], i[7]); + p[1] = _mm_add_epi16(i[1], i[6]); + p[2] = _mm_add_epi16(i[2], i[5]); + p[3] = _mm_add_epi16(i[3], i[4]); + p[4] = _mm_sub_epi16(i[3], i[4]); + p[5] = _mm_sub_epi16(i[2], i[5]); + p[6] = _mm_sub_epi16(i[1], i[6]); + p[7] = _mm_sub_epi16(i[0], i[7]); + + u[0] = _mm_add_epi16(p[0], p[3]); + u[1] = _mm_add_epi16(p[1], p[2]); + u[2] = _mm_sub_epi16(p[1], p[2]); + u[3] = _mm_sub_epi16(p[0], p[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpackhi_epi16(u[0], u[1]); + v[2] = _mm_unpacklo_epi16(u[2], u[3]); + v[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[4] = _mm_packs_epi32(u[4], u[5]); + in[8] = _mm_packs_epi32(u[2], u[3]); + in[12] = _mm_packs_epi32(u[6], u[7]); + + u[0] = _mm_unpacklo_epi16(p[5], p[6]); + u[1] = _mm_unpackhi_epi16(p[5], p[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[2], v[3]); + + t[0] = _mm_add_epi16(p[4], u[0]); + t[1] = _mm_sub_epi16(p[4], u[0]); + t[2] = _mm_sub_epi16(p[7], u[1]); + t[3] = _mm_add_epi16(p[7], u[1]); + + u[0] = _mm_unpacklo_epi16(t[0], t[3]); + u[1] = _mm_unpackhi_epi16(t[0], t[3]); + u[2] = _mm_unpacklo_epi16(t[1], t[2]); + u[3] = _mm_unpackhi_epi16(t[1], t[2]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + in[2] = _mm_packs_epi32(v[0], v[1]); + in[6] = _mm_packs_epi32(v[4], v[5]); + in[10] = _mm_packs_epi32(v[2], v[3]); + in[14] = _mm_packs_epi32(v[6], v[7]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); + u[1] = _mm_unpackhi_epi16(s[2], s[5]); + u[2] = _mm_unpacklo_epi16(s[3], s[4]); + u[3] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[2] = _mm_packs_epi32(v[0], v[1]); + t[3] = _mm_packs_epi32(v[2], v[3]); + t[4] = _mm_packs_epi32(v[4], v[5]); + t[5] = _mm_packs_epi32(v[6], v[7]); + + // stage 3 + p[0] = _mm_add_epi16(s[0], t[3]); + p[1] = _mm_add_epi16(s[1], t[2]); + p[2] = _mm_sub_epi16(s[1], t[2]); + p[3] = _mm_sub_epi16(s[0], t[3]); + p[4] = _mm_sub_epi16(s[7], t[4]); + p[5] = _mm_sub_epi16(s[6], t[5]); + p[6] = _mm_add_epi16(s[6], t[5]); + p[7] = _mm_add_epi16(s[7], t[4]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); + u[1] = _mm_unpackhi_epi16(p[1], p[6]); + u[2] = _mm_unpacklo_epi16(p[2], p[5]); + u[3] = _mm_unpackhi_epi16(p[2], p[5]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); + v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); + v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[1] = _mm_packs_epi32(v[0], v[1]); + t[2] = _mm_packs_epi32(v[2], v[3]); + t[5] = _mm_packs_epi32(v[4], v[5]); + t[6] = _mm_packs_epi32(v[6], v[7]); + + // stage 5 + s[0] = _mm_add_epi16(p[0], t[1]); + s[1] = _mm_sub_epi16(p[0], t[1]); + s[2] = _mm_sub_epi16(p[3], t[2]); + s[3] = _mm_add_epi16(p[3], t[2]); + s[4] = _mm_add_epi16(p[4], t[5]); + s[5] = _mm_sub_epi16(p[4], t[5]); + s[6] = _mm_sub_epi16(p[7], t[6]); + s[7] = _mm_add_epi16(p[7], t[6]); + + // stage 6 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); + u[1] = _mm_unpackhi_epi16(s[0], s[7]); + u[2] = _mm_unpacklo_epi16(s[1], s[6]); + u[3] = _mm_unpackhi_epi16(s[1], s[6]); + u[4] = _mm_unpacklo_epi16(s[2], s[5]); + u[5] = _mm_unpackhi_epi16(s[2], s[5]); + u[6] = _mm_unpacklo_epi16(s[3], s[4]); + u[7] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); + in[13] = _mm_packs_epi32(v[6], v[7]); + in[3] = _mm_packs_epi32(v[8], v[9]); + in[11] = _mm_packs_epi32(v[10], v[11]); + in[7] = _mm_packs_epi32(v[12], v[13]); + in[15] = _mm_packs_epi32(v[14], v[15]); +} + +static void fadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + v[0] = _mm_add_epi32(u[0], u[8]); + v[1] = _mm_add_epi32(u[1], u[9]); + v[2] = _mm_add_epi32(u[2], u[10]); + v[3] = _mm_add_epi32(u[3], u[11]); + v[4] = _mm_add_epi32(u[4], u[12]); + v[5] = _mm_add_epi32(u[5], u[13]); + v[6] = _mm_add_epi32(u[6], u[14]); + v[7] = _mm_add_epi32(u[7], u[15]); + + v[16] = _mm_add_epi32(v[0], v[4]); + v[17] = _mm_add_epi32(v[1], v[5]); + v[18] = _mm_add_epi32(v[2], v[6]); + v[19] = _mm_add_epi32(v[3], v[7]); + v[20] = _mm_sub_epi32(v[0], v[4]); + v[21] = _mm_sub_epi32(v[1], v[5]); + v[22] = _mm_sub_epi32(v[2], v[6]); + v[23] = _mm_sub_epi32(v[3], v[7]); + v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING); + v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + s[0] = _mm_packs_epi32(v[16], v[17]); + s[1] = _mm_packs_epi32(v[18], v[19]); + s[2] = _mm_packs_epi32(v[20], v[21]); + s[3] = _mm_packs_epi32(v[22], v[23]); + + v[8] = _mm_sub_epi32(u[0], u[8]); + v[9] = _mm_sub_epi32(u[1], u[9]); + v[10] = _mm_sub_epi32(u[2], u[10]); + v[11] = _mm_sub_epi32(u[3], u[11]); + v[12] = _mm_sub_epi32(u[4], u[12]); + v[13] = _mm_sub_epi32(u[5], u[13]); + v[14] = _mm_sub_epi32(u[6], u[14]); + v[15] = _mm_sub_epi32(u[7], u[15]); + + v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + s[4] = _mm_packs_epi32(v[8], v[9]); + s[5] = _mm_packs_epi32(v[10], v[11]); + s[6] = _mm_packs_epi32(v[12], v[13]); + s[7] = _mm_packs_epi32(v[14], v[15]); + // + + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[8] = _mm_add_epi32(u[0], u[4]); + v[9] = _mm_add_epi32(u[1], u[5]); + v[10] = _mm_add_epi32(u[2], u[6]); + v[11] = _mm_add_epi32(u[3], u[7]); + v[12] = _mm_sub_epi32(u[0], u[4]); + v[13] = _mm_sub_epi32(u[1], u[5]); + v[14] = _mm_sub_epi32(u[2], u[6]); + v[15] = _mm_sub_epi32(u[3], u[7]); + + v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + s[8] = _mm_packs_epi32(v[8], v[9]); + s[9] = _mm_packs_epi32(v[10], v[11]); + s[10] = _mm_packs_epi32(v[12], v[13]); + s[11] = _mm_packs_epi32(v[14], v[15]); + + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(s[4], s[5]); + u[1] = _mm_unpackhi_epi16(s[4], s[5]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +static void fdct16_sse2(__m128i *in0, __m128i *in1) { + fdct16_8col(in0); + fdct16_8col(in1); + array_transpose_16x16(in0, in1); +} + +static void fadst16_sse2(__m128i *in0, __m128i *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + array_transpose_16x16(in0, in1); +} + +#if CONFIG_EXT_TX +static void fidtx16_sse2(__m128i *in0, __m128i *in1) { + idtx16_8col(in0); + idtx16_8col(in1); + array_transpose_16x16(in0, in1); +} +#endif // CONFIG_EXT_TX + +void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case ADST_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 1, 1); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case IDTX: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case V_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fdct16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_DCT: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case V_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_ADST: + load_buffer_16x16(input, in0, in1, stride, 0, 0); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case V_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 1, 0); + fadst16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fidtx16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case H_FLIPADST: + load_buffer_16x16(input, in0, in1, stride, 0, 1); + fidtx16_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } +} + +static INLINE void prepare_4x8_row_first(__m128i *in) { + in[0] = _mm_unpacklo_epi64(in[0], in[2]); + in[1] = _mm_unpacklo_epi64(in[1], in[3]); + transpose_4x4(in); + in[4] = _mm_unpacklo_epi64(in[4], in[6]); + in[5] = _mm_unpacklo_epi64(in[5], in[7]); + transpose_4x4(in + 4); +} + +// Load input into the left-hand half of in (ie, into lanes 0..3 of +// each element of in). The right hand half (lanes 4..7) should be +// treated as being filled with "don't care" values. +static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + const int shift = 2; + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); + in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); + in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); + in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride)); + in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + in[4] = _mm_shufflelo_epi16(in[4], 0x1b); + in[5] = _mm_shufflelo_epi16(in[5], 0x1b); + in[6] = _mm_shufflelo_epi16(in[6], 0x1b); + in[7] = _mm_shufflelo_epi16(in[7], 0x1b); + } + + in[0] = _mm_slli_epi16(in[0], shift); + in[1] = _mm_slli_epi16(in[1], shift); + in[2] = _mm_slli_epi16(in[2], shift); + in[3] = _mm_slli_epi16(in[3], shift); + in[4] = _mm_slli_epi16(in[4], shift); + in[5] = _mm_slli_epi16(in[5], shift); + in[6] = _mm_slli_epi16(in[6], shift); + in[7] = _mm_slli_epi16(in[7], shift); + + scale_sqrt2_8x4(in); + scale_sqrt2_8x4(in + 4); + prepare_4x8_row_first(in); +} + +static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) { + __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67; + const int shift = 1; + + // revert the 8x8 txfm's transpose + array_transpose_8x8(res, res); + + in01 = _mm_unpacklo_epi64(res[0], res[1]); + in23 = _mm_unpacklo_epi64(res[2], res[3]); + in45 = _mm_unpacklo_epi64(res[4], res[5]); + in67 = _mm_unpacklo_epi64(res[6], res[7]); + + sign01 = _mm_srai_epi16(in01, 15); + sign23 = _mm_srai_epi16(in23, 15); + sign45 = _mm_srai_epi16(in45, 15); + sign67 = _mm_srai_epi16(in67, 15); + + in01 = _mm_sub_epi16(in01, sign01); + in23 = _mm_sub_epi16(in23, sign23); + in45 = _mm_sub_epi16(in45, sign45); + in67 = _mm_sub_epi16(in67, sign67); + + in01 = _mm_srai_epi16(in01, shift); + in23 = _mm_srai_epi16(in23, shift); + in45 = _mm_srai_epi16(in45, shift); + in67 = _mm_srai_epi16(in67, shift); + + store_output(&in01, (output + 0 * 8)); + store_output(&in23, (output + 1 * 8)); + store_output(&in45, (output + 2 * 8)); + store_output(&in67, (output + 3 * 8)); +} + +void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x8(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fdct8_sse2(in); + break; + case ADST_DCT: + load_buffer_4x8(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fadst8_sse2(in); + break; + case DCT_ADST: + load_buffer_4x8(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fdct8_sse2(in); + break; + case ADST_ADST: + load_buffer_4x8(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_4x8(input, in, stride, 1, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fadst8_sse2(in); + break; + case DCT_FLIPADST: + load_buffer_4x8(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fdct8_sse2(in); + break; + case FLIPADST_FLIPADST: + load_buffer_4x8(input, in, stride, 1, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case ADST_FLIPADST: + load_buffer_4x8(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case FLIPADST_ADST: + load_buffer_4x8(input, in, stride, 1, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case IDTX: + load_buffer_4x8(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fidtx8_sse2(in); + break; + case V_DCT: + load_buffer_4x8(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fdct8_sse2(in); + break; + case H_DCT: + load_buffer_4x8(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fidtx8_sse2(in); + break; + case V_ADST: + load_buffer_4x8(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fadst8_sse2(in); + break; + case H_ADST: + load_buffer_4x8(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fidtx8_sse2(in); + break; + case V_FLIPADST: + load_buffer_4x8(input, in, stride, 1, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fadst8_sse2(in); + break; + case H_FLIPADST: + load_buffer_4x8(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fidtx8_sse2(in); + break; +#endif + default: assert(0); break; + } + write_buffer_4x8(output, in); +} + +// Load input into the left-hand half of in (ie, into lanes 0..3 of +// each element of in). The right hand half (lanes 4..7) should be +// treated as being filled with "don't care" values. +// The input is split horizontally into two 4x4 +// chunks 'l' and 'r'. Then 'l' is stored in the top-left 4x4 +// block of 'in' and 'r' is stored in the bottom-left block. +// This is to allow us to reuse 4x4 transforms. +static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + const int shift = 2; + if (!flipud) { + in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + } + + in[0] = _mm_slli_epi16(in[0], shift); + in[1] = _mm_slli_epi16(in[1], shift); + in[2] = _mm_slli_epi16(in[2], shift); + in[3] = _mm_slli_epi16(in[3], shift); + + scale_sqrt2_8x4(in); + + in[4] = _mm_shuffle_epi32(in[0], 0xe); + in[5] = _mm_shuffle_epi32(in[1], 0xe); + in[6] = _mm_shuffle_epi32(in[2], 0xe); + in[7] = _mm_shuffle_epi32(in[3], 0xe); +} + +static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) { + __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3; + const int shift = 1; + sign0 = _mm_srai_epi16(res[0], 15); + sign1 = _mm_srai_epi16(res[1], 15); + sign2 = _mm_srai_epi16(res[2], 15); + sign3 = _mm_srai_epi16(res[3], 15); + + out0 = _mm_sub_epi16(res[0], sign0); + out1 = _mm_sub_epi16(res[1], sign1); + out2 = _mm_sub_epi16(res[2], sign2); + out3 = _mm_sub_epi16(res[3], sign3); + + out0 = _mm_srai_epi16(out0, shift); + out1 = _mm_srai_epi16(out1, shift); + out2 = _mm_srai_epi16(out2, shift); + out3 = _mm_srai_epi16(out3, shift); + + store_output(&out0, (output + 0 * 8)); + store_output(&out1, (output + 1 * 8)); + store_output(&out2, (output + 2 * 8)); + store_output(&out3, (output + 3 * 8)); +} + +void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x4(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fdct8_sse2(in); + break; + case ADST_DCT: + load_buffer_8x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fdct8_sse2(in); + break; + case DCT_ADST: + load_buffer_8x4(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fadst8_sse2(in); + break; + case ADST_ADST: + load_buffer_8x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_8x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fdct8_sse2(in); + break; + case DCT_FLIPADST: + load_buffer_8x4(input, in, stride, 0, 1); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fadst8_sse2(in); + break; + case FLIPADST_FLIPADST: + load_buffer_8x4(input, in, stride, 1, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case ADST_FLIPADST: + load_buffer_8x4(input, in, stride, 0, 1); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case FLIPADST_ADST: + load_buffer_8x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fadst8_sse2(in); + break; + case IDTX: + load_buffer_8x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fidtx8_sse2(in); + break; + case V_DCT: + load_buffer_8x4(input, in, stride, 0, 0); + fdct4_sse2(in); + fdct4_sse2(in + 4); + fidtx8_sse2(in); + break; + case H_DCT: + load_buffer_8x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fdct8_sse2(in); + break; + case V_ADST: + load_buffer_8x4(input, in, stride, 0, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fidtx8_sse2(in); + break; + case H_ADST: + load_buffer_8x4(input, in, stride, 0, 0); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fadst8_sse2(in); + break; + case V_FLIPADST: + load_buffer_8x4(input, in, stride, 1, 0); + fadst4_sse2(in); + fadst4_sse2(in + 4); + fidtx8_sse2(in); + break; + case H_FLIPADST: + load_buffer_8x4(input, in, stride, 0, 1); + fidtx4_sse2(in); + fidtx4_sse2(in + 4); + fadst8_sse2(in); + break; +#endif + default: assert(0); break; + } + write_buffer_8x4(output, in); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + // Load 2 8x8 blocks + const int16_t *t = input; + const int16_t *b = input + 8 * stride; + + if (flipud) { + const int16_t *const tmp = t; + t = b; + b = tmp; + } + + load_buffer_8x8(t, in, stride, flipud, fliplr); + scale_sqrt2_8x8(in); + load_buffer_8x8(b, in + 8, stride, flipud, fliplr); + scale_sqrt2_8x8(in + 8); +} + +static INLINE void round_power_of_two_signed(__m128i *x, int n) { + const __m128i rounding = _mm_set1_epi16((1 << n) >> 1); + const __m128i sign = _mm_srai_epi16(*x, 15); + const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign); + *x = _mm_srai_epi16(res, n); +} + +static void row_8x16_rounding(__m128i *in, int bits) { + int i; + for (i = 0; i < 16; i++) { + round_power_of_two_signed(&in[i], bits); + } +} + +void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[16]; + + __m128i *const t = in; // Alias to top 8x8 sub block + __m128i *const b = in + 8; // Alias to bottom 8x8 sub block + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + row_8x16_rounding(in, 2); + fdct16_8col(in); + break; + case ADST_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case DCT_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fdct16_8col(in); + break; + case ADST_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_8x16(input, in, stride, 1, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case DCT_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 1); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fdct16_8col(in); + break; + case FLIPADST_FLIPADST: + load_buffer_8x16(input, in, stride, 1, 1); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case ADST_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 1); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case FLIPADST_ADST: + load_buffer_8x16(input, in, stride, 1, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case IDTX: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + row_8x16_rounding(in, 2); + idtx16_8col(in); + break; + case V_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + row_8x16_rounding(in, 2); + fdct16_8col(in); + break; + case H_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + row_8x16_rounding(in, 2); + idtx16_8col(in); + break; + case V_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case H_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + idtx16_8col(in); + break; + case V_FLIPADST: + load_buffer_8x16(input, in, stride, 1, 0); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + row_8x16_rounding(in, 2); + fadst16_8col(in); + break; + case H_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 1); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + row_8x16_rounding(in, 2); + idtx16_8col(in); + break; +#endif + default: assert(0); break; + } + write_buffer_8x8(output, t, 8); + write_buffer_8x8(output + 64, b, 8); +} + +static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + // Load 2 8x8 blocks + const int16_t *l = input; + const int16_t *r = input + 8; + + if (fliplr) { + const int16_t *const tmp = l; + l = r; + r = tmp; + } + + // load first 8 columns + load_buffer_8x8(l, in, stride, flipud, fliplr); + scale_sqrt2_8x8(in); + load_buffer_8x8(r, in + 8, stride, flipud, fliplr); + scale_sqrt2_8x8(in + 8); +} + +#define col_16x8_rounding row_8x16_rounding + +void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[16]; + + __m128i *const l = in; // Alias to left 8x8 sub block + __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store + // in the second half of the array + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + col_16x8_rounding(in, 2); + fdct16_8col(in); + break; + case ADST_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fdct16_8col(in); + break; + case DCT_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case ADST_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fdct16_8col(in); + break; + case DCT_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fdct8_sse2(l); + fdct8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case FLIPADST_FLIPADST: + load_buffer_16x8(input, in, stride, 1, 1); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case ADST_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case FLIPADST_ADST: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case IDTX: + load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + col_16x8_rounding(in, 2); + idtx16_8col(in); + break; + case V_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + col_16x8_rounding(in, 2); + idtx16_8col(in); + break; + case H_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + col_16x8_rounding(in, 2); + fdct16_8col(in); + break; + case V_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + idtx16_8col(in); + break; + case H_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; + case V_FLIPADST: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + col_16x8_rounding(in, 2); + idtx16_8col(in); + break; + case H_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fidtx8_sse2(l); + fidtx8_sse2(r); + col_16x8_rounding(in, 2); + fadst16_8col(in); + break; +#endif + default: assert(0); break; + } + array_transpose_8x8(l, l); + array_transpose_8x8(r, r); + write_buffer_8x8(output, l, 16); + write_buffer_8x8(output + 8, r, 16); +} + +// Note: The 16-column 32-element transforms expect their input to be +// split up into a 2x2 grid of 8x16 blocks +static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + fdct32_8col(tl, bl); + fdct32_8col(tr, br); + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); +} + +#if CONFIG_EXT_TX +static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + int i; + for (i = 0; i < 16; ++i) { + tl[i] = _mm_slli_epi16(tl[i], 2); + tr[i] = _mm_slli_epi16(tr[i], 2); + bl[i] = _mm_slli_epi16(bl[i], 2); + br[i] = _mm_slli_epi16(br[i], 2); + } + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); +} +#endif + +static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl, + __m128i *intr, __m128i *inbl, + __m128i *inbr, int stride, int flipud, + int fliplr) { + int i; + if (flipud) { + input = input + 31 * stride; + stride = -stride; + } + + for (i = 0; i < 16; ++i) { + intl[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); + intr[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); + inbl[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2); + inbr[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2); + } + + if (fliplr) { + __m128i tmp; + for (i = 0; i < 16; ++i) { + tmp = intl[i]; + intl[i] = mm_reverse_epi16(intr[i]); + intr[i] = mm_reverse_epi16(tmp); + tmp = inbl[i]; + inbl[i] = mm_reverse_epi16(inbr[i]); + inbr[i] = mm_reverse_epi16(tmp); + } + } + + scale_sqrt2_8x16(intl); + scale_sqrt2_8x16(intr); + scale_sqrt2_8x16(inbl); + scale_sqrt2_8x16(inbr); +} + +static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl, + __m128i *restr, __m128i *resbl, + __m128i *resbr) { + int i; + for (i = 0; i < 16; ++i) { + store_output(&restl[i], output + i * 16 + 0); + store_output(&restr[i], output + i * 16 + 8); + store_output(&resbl[i], output + (i + 16) * 16 + 0); + store_output(&resbr[i], output + (i + 16) * 16 + 8); + } +} + +static INLINE void round_signed_8x8(__m128i *in, const int bit) { + const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1); + __m128i sign0 = _mm_srai_epi16(in[0], 15); + __m128i sign1 = _mm_srai_epi16(in[1], 15); + __m128i sign2 = _mm_srai_epi16(in[2], 15); + __m128i sign3 = _mm_srai_epi16(in[3], 15); + __m128i sign4 = _mm_srai_epi16(in[4], 15); + __m128i sign5 = _mm_srai_epi16(in[5], 15); + __m128i sign6 = _mm_srai_epi16(in[6], 15); + __m128i sign7 = _mm_srai_epi16(in[7], 15); + + in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0); + in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1); + in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2); + in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3); + in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4); + in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5); + in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6); + in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7); + + in[0] = _mm_srai_epi16(in[0], bit); + in[1] = _mm_srai_epi16(in[1], bit); + in[2] = _mm_srai_epi16(in[2], bit); + in[3] = _mm_srai_epi16(in[3], bit); + in[4] = _mm_srai_epi16(in[4], bit); + in[5] = _mm_srai_epi16(in[5], bit); + in[6] = _mm_srai_epi16(in[6], bit); + in[7] = _mm_srai_epi16(in[7], bit); +} + +static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) { + const int bit = 4; + round_signed_8x8(in0, bit); + round_signed_8x8(in0 + 8, bit); + round_signed_8x8(in1, bit); + round_signed_8x8(in1 + 8, bit); +} + +// Note: +// suffix "t" indicates the transpose operation comes first +static void fdct16t_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + fdct16_8col(in0); + fdct16_8col(in1); +} + +static void fadst16t_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + fadst16_8col(in0); + fadst16_8col(in1); +} + +static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br) { + array_transpose_16x16(tl, tr); + array_transpose_16x16(bl, br); + fdct32_8col(tl, bl); + fdct32_8col(tr, br); +} + +typedef enum transpose_indicator_ { + transpose, + no_transpose, +} transpose_indicator; + +static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, + __m128i *br, transpose_indicator t) { + __m128i tmpl[16], tmpr[16]; + int i; + + // Copy the bottom half of the input to temporary storage + for (i = 0; i < 16; ++i) { + tmpl[i] = bl[i]; + tmpr[i] = br[i]; + } + + // Generate the bottom half of the output + for (i = 0; i < 16; ++i) { + bl[i] = _mm_slli_epi16(tl[i], 2); + br[i] = _mm_slli_epi16(tr[i], 2); + } + array_transpose_16x16(bl, br); + + // Copy the temporary storage back to the top half of the input + for (i = 0; i < 16; ++i) { + tl[i] = tmpl[i]; + tr[i] = tmpr[i]; + } + + // Generate the top half of the output + scale_sqrt2_8x16(tl); + scale_sqrt2_8x16(tr); + if (t == transpose) + fdct16t_sse2(tl, tr); + else + fdct16_sse2(tl, tr); +} + +// Note on data layout, for both this and the 32x16 transforms: +// So that we can reuse the 16-element transforms easily, +// we want to split the input into 8x16 blocks. +// For 16x32, this means the input is a 2x2 grid of such blocks. +// For 32x16, it means the input is a 4x1 grid. +void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i intl[16], intr[16], inbl[16], inbr[16]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fdct16t_sse2(intl, intr); + fdct16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fdct32t_16col(intl, intr, inbl, inbr); + break; + case ADST_DCT: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fdct16t_sse2(intl, intr); + fdct16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case DCT_ADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fdct32t_16col(intl, intr, inbl, inbr); + break; + case ADST_ADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); + fdct16t_sse2(intl, intr); + fdct16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case DCT_FLIPADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fdct32t_16col(intl, intr, inbl, inbr); + break; + case FLIPADST_FLIPADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case ADST_FLIPADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case FLIPADST_ADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case IDTX: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fidtx16_sse2(intl, intr); + fidtx16_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fidtx32_16col(intl, intr, inbl, inbr); + break; + case V_DCT: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fidtx16_sse2(intl, intr); + fidtx16_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fdct32t_16col(intl, intr, inbl, inbr); + break; + case H_DCT: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fdct16t_sse2(intl, intr); + fdct16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fidtx32_16col(intl, intr, inbl, inbr); + break; + case V_ADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fidtx16_sse2(intl, intr); + fidtx16_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case H_ADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fidtx32_16col(intl, intr, inbl, inbr); + break; + case V_FLIPADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0); + fidtx16_sse2(intl, intr); + fidtx16_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fhalfright32_16col(intl, intr, inbl, inbr, transpose); + break; + case H_FLIPADST: + load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1); + fadst16t_sse2(intl, intr); + fadst16t_sse2(inbl, inbr); + round_signed_16x16(intl, intr); + round_signed_16x16(inbl, inbr); + fidtx32_16col(intl, intr, inbl, inbr); + break; +#endif + default: assert(0); break; + } + write_buffer_16x32(output, intl, intr, inbl, inbr); +} + +static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0, + __m128i *in1, __m128i *in2, __m128i *in3, + int stride, int flipud, int fliplr) { + int i; + if (flipud) { + input += 15 * stride; + stride = -stride; + } + + for (i = 0; i < 16; ++i) { + in0[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); + in1[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); + in2[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); + in3[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); + } + + if (fliplr) { + for (i = 0; i < 16; ++i) { + __m128i tmp1 = in0[i]; + __m128i tmp2 = in1[i]; + in0[i] = mm_reverse_epi16(in3[i]); + in1[i] = mm_reverse_epi16(in2[i]); + in2[i] = mm_reverse_epi16(tmp2); + in3[i] = mm_reverse_epi16(tmp1); + } + } + + scale_sqrt2_8x16(in0); + scale_sqrt2_8x16(in1); + scale_sqrt2_8x16(in2); + scale_sqrt2_8x16(in3); +} + +static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0, + __m128i *res1, __m128i *res2, + __m128i *res3) { + int i; + for (i = 0; i < 16; ++i) { + store_output(&res0[i], output + i * 32 + 0); + store_output(&res1[i], output + i * 32 + 8); + store_output(&res2[i], output + i * 32 + 16); + store_output(&res3[i], output + i * 32 + 24); + } +} + +void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in0[16], in1[16], in2[16], in3[16]; + + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + switch (tx_type) { + case DCT_DCT: + fdct16_sse2(in0, in1); + fdct16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fdct32_16col(in0, in1, in2, in3); + break; + case ADST_DCT: + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fdct32_16col(in0, in1, in2, in3); + break; + case DCT_ADST: + fdct16_sse2(in0, in1); + fdct16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case ADST_ADST: + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fdct32_16col(in0, in1, in2, in3); + break; + case DCT_FLIPADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); + fdct16_sse2(in0, in1); + fdct16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case FLIPADST_FLIPADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case ADST_FLIPADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case FLIPADST_ADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case IDTX: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + fidtx16_sse2(in0, in1); + fidtx16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fidtx32_16col(in0, in1, in2, in3); + break; + case V_DCT: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + fdct16_sse2(in0, in1); + fdct16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fidtx32_16col(in0, in1, in2, in3); + break; + case H_DCT: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + fidtx16_sse2(in0, in1); + fidtx16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fdct32_16col(in0, in1, in2, in3); + break; + case V_ADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fidtx32_16col(in0, in1, in2, in3); + break; + case H_ADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); + fidtx16_sse2(in0, in1); + fidtx16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; + case V_FLIPADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0); + fadst16_sse2(in0, in1); + fadst16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fidtx32_16col(in0, in1, in2, in3); + break; + case H_FLIPADST: + load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1); + fidtx16_sse2(in0, in1); + fidtx16_sse2(in2, in3); + round_signed_16x16(in0, in1); + round_signed_16x16(in2, in3); + fhalfright32_16col(in0, in1, in2, in3, no_transpose); + break; +#endif + default: assert(0); break; + } + write_buffer_32x16(output, in0, in1, in2, in3); +} + +// Note: +// 32x32 hybrid fwd txfm +// 4x2 grids of 8x16 block. Each block is represented by __m128i in[16] +static INLINE void load_buffer_32x32(const int16_t *input, + __m128i *in0 /*in0[32]*/, + __m128i *in1 /*in1[32]*/, + __m128i *in2 /*in2[32]*/, + __m128i *in3 /*in3[32]*/, int stride, + int flipud, int fliplr) { + if (flipud) { + input += 31 * stride; + stride = -stride; + } + + int i; + for (i = 0; i < 32; ++i) { + in0[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2); + in1[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2); + in2[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2); + in3[i] = _mm_slli_epi16( + _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2); + } + + if (fliplr) { + for (i = 0; i < 32; ++i) { + __m128i tmp1 = in0[i]; + __m128i tmp2 = in1[i]; + in0[i] = mm_reverse_epi16(in3[i]); + in1[i] = mm_reverse_epi16(in2[i]); + in2[i] = mm_reverse_epi16(tmp2); + in3[i] = mm_reverse_epi16(tmp1); + } + } +} + +static INLINE void swap_16x16(__m128i *b0l /*b0l[16]*/, + __m128i *b0r /*b0r[16]*/, + __m128i *b1l /*b1l[16]*/, + __m128i *b1r /*b1r[16]*/) { + int i; + for (i = 0; i < 16; ++i) { + __m128i tmp0 = b1l[i]; + __m128i tmp1 = b1r[i]; + b1l[i] = b0l[i]; + b1r[i] = b0r[i]; + b0l[i] = tmp0; + b0r[i] = tmp1; + } +} + +static INLINE void fdct32(__m128i *in0, __m128i *in1, __m128i *in2, + __m128i *in3) { + fdct32_8col(in0, &in0[16]); + fdct32_8col(in1, &in1[16]); + fdct32_8col(in2, &in2[16]); + fdct32_8col(in3, &in3[16]); + + array_transpose_16x16(in0, in1); + array_transpose_16x16(&in0[16], &in1[16]); + array_transpose_16x16(in2, in3); + array_transpose_16x16(&in2[16], &in3[16]); + + swap_16x16(&in0[16], &in1[16], in2, in3); +} + +static INLINE void fhalfright32(__m128i *in0, __m128i *in1, __m128i *in2, + __m128i *in3) { + fhalfright32_16col(in0, in1, &in0[16], &in1[16], no_transpose); + fhalfright32_16col(in2, in3, &in2[16], &in3[16], no_transpose); + swap_16x16(&in0[16], &in1[16], in2, in3); +} + +#if CONFIG_EXT_TX +static INLINE void fidtx32(__m128i *in0, __m128i *in1, __m128i *in2, + __m128i *in3) { + fidtx32_16col(in0, in1, &in0[16], &in1[16]); + fidtx32_16col(in2, in3, &in2[16], &in3[16]); + swap_16x16(&in0[16], &in1[16], in2, in3); +} +#endif + +static INLINE void round_signed_32x32(__m128i *in0, __m128i *in1, __m128i *in2, + __m128i *in3) { + round_signed_16x16(in0, in1); + round_signed_16x16(&in0[16], &in1[16]); + round_signed_16x16(in2, in3); + round_signed_16x16(&in2[16], &in3[16]); +} + +static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2, + __m128i *in3, tran_low_t *output) { + int i; + for (i = 0; i < 32; ++i) { + store_output(&in0[i], output + i * 32 + 0); + store_output(&in1[i], output + i * 32 + 8); + store_output(&in2[i], output + i * 32 + 16); + store_output(&in3[i], output + i * 32 + 24); + } +} + +void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in0[32], in1[32], in2[32], in3[32]; + + load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0); + switch (tx_type) { + case DCT_DCT: + fdct32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fdct32(in0, in1, in2, in3); + break; + case ADST_DCT: + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fdct32(in0, in1, in2, in3); + break; + case DCT_ADST: + fdct32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case ADST_ADST: + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fdct32(in0, in1, in2, in3); + break; + case DCT_FLIPADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); + fdct32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case FLIPADST_FLIPADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 1); + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case ADST_FLIPADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case FLIPADST_ADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case IDTX: + fidtx32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fidtx32(in0, in1, in2, in3); + break; + case V_DCT: + fdct32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fidtx32(in0, in1, in2, in3); + break; + case H_DCT: + fidtx32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fdct32(in0, in1, in2, in3); + break; + case V_ADST: + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fidtx32(in0, in1, in2, in3); + break; + case H_ADST: + fidtx32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; + case V_FLIPADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 1, 0); + fhalfright32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fidtx32(in0, in1, in2, in3); + break; + case H_FLIPADST: + load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 1); + fidtx32(in0, in1, in2, in3); + round_signed_32x32(in0, in1, in2, in3); + fhalfright32(in0, in1, in2, in3); + break; +#endif + default: assert(0); + } + write_buffer_32x32(in0, in1, in2, in3, output); +} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..a99db3d6e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -0,0 +1,87 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + +%if CONFIG_HIGHBITDEPTH + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 +%else + mova [outputq], m0 + mova [outputq + 16], m1 +%endif + + RET diff --git a/third_party/aom/av1/encoder/x86/dct_ssse3.c b/third_party/aom/av1/encoder/x86/dct_ssse3.c new file mode 100644 index 000000000..717a99af8 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/dct_ssse3.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#if defined(_MSC_VER) && _MSC_VER <= 1500 +// Need to include math.h before calling tmmintrin.h/intrin.h +// in certain versions of MSVS. +#include +#endif +#include // SSSE3 + +#include "./av1_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +void av1_fdct8x8_quant_ssse3( + const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { + __m128i zero; + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i *in[8]; + int index = 0; + + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)coeff_ptr; + + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + in[0] = &in0; + in[1] = &in1; + in[2] = &in2; + in[3] = &in3; + in[4] = &in4; + in[5] = &in5; + in[6] = &in6; + in[7] = &in7; + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = _mm_add_epi16(in0, in7); + const __m128i q1 = _mm_add_epi16(in1, in6); + const __m128i q2 = _mm_add_epi16(in2, in5); + const __m128i q3 = _mm_add_epi16(in3, in4); + const __m128i q4 = _mm_sub_epi16(in3, in4); + const __m128i q5 = _mm_sub_epi16(in2, in5); + const __m128i q6 = _mm_sub_epi16(in1, in6); + const __m128i q7 = _mm_sub_epi16(in0, in7); + // Work on first four results + { + // Add/subtract + const __m128i r0 = _mm_add_epi16(q0, q3); + const __m128i r1 = _mm_add_epi16(q1, q2); + const __m128i r2 = _mm_sub_epi16(q1, q2); + const __m128i r3 = _mm_sub_epi16(q0, q3); + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_sub_epi16(q6, q5); + const __m128i d1 = _mm_add_epi16(q6, q5); + const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); + const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); + + // Add/subtract + const __m128i x0 = _mm_add_epi16(q4, r0); + const __m128i x1 = _mm_sub_epi16(q4, r0); + const __m128i x2 = _mm_sub_epi16(q7, r1); + const __m128i x3 = _mm_add_epi16(q7, r1); + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + } + + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + + if (!skip_block) { + __m128i eob; + __m128i round, quant, dequant, thr; + int16_t nzflag; + { + __m128i coeff0, coeff1; + + // Setup global values + { + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = *in[0]; + coeff1 = *in[1]; + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + // AC only loop + index = 2; + thr = _mm_srai_epi16(dequant, 1); + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); + coeff0 = *in[index]; + coeff1 = *in[index + 1]; + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } else { + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + } + } + + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + index += 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c new file mode 100644 index 000000000..ae733a1ce --- /dev/null +++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 + +#include "./av1_rtcd.h" +#include "aom/aom_integer.h" + +int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_set1_epi16(0); + + // init sse and ssz registerd to zero + sse_reg = _mm256_set1_epi16(0); + ssz_reg = _mm256_set1_epi16(0); + + for (i = 0; i < block_size; i += 16) { + // load 32 bytes from coeff and dqcoeff + coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); + dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i *)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); + _mm256_zeroupper(); + return sse; +} diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm new file mode 100644 index 000000000..4680f1fab --- /dev/null +++ b/third_party/aom/av1/encoder/x86/error_sse2.asm @@ -0,0 +1,125 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m2, [uqcq+sizeq*2] + mova m0, [dqcq+sizeq*2] + mova m3, [uqcq+sizeq*2+mmsize] + mova m1, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m1, m5 + paddq m4, m0 + punpckhdq m1, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m1 + punpckhdq m2, m5 + paddq m6, m7 + punpckldq m7, m3, m5 + paddq m6, m2 + punpckhdq m3, m5 + paddq m6, m7 + paddq m6, m3 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET + +; Compute the sum of squared difference between two int16_t vectors. +; int64_t av1_block_error_fp(int16_t *coeff, int16_t *dqcoeff, +; intptr_t block_size) + +INIT_XMM sse2 +cglobal block_error_fp, 3, 3, 6, uqc, dqc, size + pxor m4, m4 ; sse accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m2, [uqcq+sizeq*2] + mova m0, [dqcq+sizeq*2] + mova m3, [uqcq+sizeq*2+mmsize] + mova m1, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + ; accumulate in 64bit + punpckldq m3, m0, m5 + punpckhdq m0, m5 + paddq m4, m3 + punpckldq m3, m1, m5 + paddq m4, m0 + punpckhdq m1, m5 + paddq m4, m3 + paddq m4, m1 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + paddq m4, m5 +%if ARCH_X86_64 + movq rax, m4 +%else + pshufd m5, m4, 0x1 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c new file mode 100644 index 000000000..777304ace --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/common/common.h" + +int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps) { + int i, j, test; + uint32_t temp[4]; + __m128i max, min, cmp0, cmp1, cmp2, cmp3; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bps - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32(0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c new file mode 100644 index 000000000..f201a29aa --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -0,0 +1,1895 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include /* SSE4.1 */ + +#include "./av1_rtcd.h" +#include "./aom_config.h" +#include "av1/common/av1_fwd_txfm2d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + } + + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(in[1]); + in[2] = _mm_cvtepi16_epi32(in[2]); + in[3] = _mm_cvtepi16_epi32(in[3]); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); +} + +// We only use stage-2 bit; +// shift[0] is used in load_buffer_4x4() +// shift[1] is used in txfm_func_col() +// shift[2] is used in txfm_func_row() +static void fdct4x4_sse4_1(__m128i *in, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i s0, s1, s2, s3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + s0 = _mm_add_epi32(in[0], in[3]); + s1 = _mm_add_epi32(in[1], in[2]); + s2 = _mm_sub_epi32(in[1], in[2]); + s3 = _mm_sub_epi32(in[0], in[3]); + + // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); + u0 = _mm_mullo_epi32(s0, cospi32); + u1 = _mm_mullo_epi32(s1, cospi32); + u2 = _mm_add_epi32(u0, u1); + v0 = _mm_sub_epi32(u0, u1); + + u3 = _mm_add_epi32(u2, rnding); + v1 = _mm_add_epi32(v0, rnding); + + u0 = _mm_srai_epi32(u3, bit); + u2 = _mm_srai_epi32(v1, bit); + + // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); + v0 = _mm_mullo_epi32(s2, cospi48); + v1 = _mm_mullo_epi32(s3, cospi16); + v2 = _mm_add_epi32(v0, v1); + + v3 = _mm_add_epi32(v2, rnding); + u1 = _mm_srai_epi32(v3, bit); + + v0 = _mm_mullo_epi32(s2, cospi16); + v1 = _mm_mullo_epi32(s3, cospi48); + v2 = _mm_sub_epi32(v1, v0); + + v3 = _mm_add_epi32(v2, rnding); + u3 = _mm_srai_epi32(v3, bit); + + // Note: shift[1] and shift[2] are zeros + + // Transpose 4x4 32-bit + v0 = _mm_unpacklo_epi32(u0, u1); + v1 = _mm_unpackhi_epi32(u0, u1); + v2 = _mm_unpacklo_epi32(u2, u3); + v3 = _mm_unpackhi_epi32(u2, u3); + + in[0] = _mm_unpacklo_epi64(v0, v2); + in[1] = _mm_unpackhi_epi64(v0, v2); + in[2] = _mm_unpacklo_epi64(v1, v3); + in[3] = _mm_unpackhi_epi64(v1, v3); +} + +static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); +} + +// Note: +// We implement av1_fwd_txfm2d_4x4(). This function is kept here since +// av1_highbd_fht4x4_c() is not removed yet +void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + (void)input; + (void)output; + (void)stride; + (void)tx_type; + assert(0); +} + +static void fadst4x4_sse4_1(__m128i *in, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i s0, s1, s2, s3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + // stage 0 + // stage 1 + // stage 2 + u0 = _mm_mullo_epi32(in[3], cospi8); + u1 = _mm_mullo_epi32(in[0], cospi56); + u2 = _mm_add_epi32(u0, u1); + s0 = _mm_add_epi32(u2, rnding); + s0 = _mm_srai_epi32(s0, bit); + + v0 = _mm_mullo_epi32(in[3], cospi56); + v1 = _mm_mullo_epi32(in[0], cospi8); + v2 = _mm_sub_epi32(v0, v1); + s1 = _mm_add_epi32(v2, rnding); + s1 = _mm_srai_epi32(s1, bit); + + u0 = _mm_mullo_epi32(in[1], cospi40); + u1 = _mm_mullo_epi32(in[2], cospi24); + u2 = _mm_add_epi32(u0, u1); + s2 = _mm_add_epi32(u2, rnding); + s2 = _mm_srai_epi32(s2, bit); + + v0 = _mm_mullo_epi32(in[1], cospi24); + v1 = _mm_mullo_epi32(in[2], cospi40); + v2 = _mm_sub_epi32(v0, v1); + s3 = _mm_add_epi32(v2, rnding); + s3 = _mm_srai_epi32(s3, bit); + + // stage 3 + u0 = _mm_add_epi32(s0, s2); + u2 = _mm_sub_epi32(s0, s2); + u1 = _mm_add_epi32(s1, s3); + u3 = _mm_sub_epi32(s1, s3); + + // stage 4 + v0 = _mm_mullo_epi32(u2, cospi32); + v1 = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(v0, v1); + s2 = _mm_add_epi32(v2, rnding); + u2 = _mm_srai_epi32(s2, bit); + + v2 = _mm_sub_epi32(v0, v1); + s3 = _mm_add_epi32(v2, rnding); + u3 = _mm_srai_epi32(s3, bit); + + // u0, u1, u2, u3 + u2 = _mm_sub_epi32(kZero, u2); + u1 = _mm_sub_epi32(kZero, u1); + + // u0, u2, u3, u1 + // Transpose 4x4 32-bit + v0 = _mm_unpacklo_epi32(u0, u2); + v1 = _mm_unpackhi_epi32(u0, u2); + v2 = _mm_unpacklo_epi32(u3, u1); + v3 = _mm_unpackhi_epi32(u3, u1); + + in[0] = _mm_unpacklo_epi64(v0, v2); + in[1] = _mm_unpackhi_epi64(v0, v2); + in[2] = _mm_unpacklo_epi64(v1, v3); + in[3] = _mm_unpackhi_epi64(v1, v3); +} + +void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, + int input_stride, int tx_type, int bd) { + __m128i in[4]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &fwd_txfm_2d_cfg_dct_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); + fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); + fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case ADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case DCT_ADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); + fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case ADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_4; + load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case DCT_FLIPADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); + fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case ADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); + fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); + fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, coeff); + break; +#endif + default: assert(0); + } + (void)bd; +} + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = _mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0], in[14]); + v[7] = _mm_sub_epi32(in[0], in[14]); // v[7] + u[1] = _mm_add_epi32(in[2], in[12]); + u[6] = _mm_sub_epi32(in[2], in[12]); + u[2] = _mm_add_epi32(in[4], in[10]); + u[5] = _mm_sub_epi32(in[4], in[10]); + u[3] = _mm_add_epi32(in[6], in[8]); + v[4] = _mm_sub_epi32(in[6], in[8]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[2] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[14] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[10] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[6] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0] = u[0]; // buf0[0] + out[8] = u[1]; // buf0[1] + out[4] = u[2]; // buf0[2] + out[12] = u[3]; // buf0[3] + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[1], in[15]); + v[7] = _mm_sub_epi32(in[1], in[15]); // v[7] + u[1] = _mm_add_epi32(in[3], in[13]); + u[6] = _mm_sub_epi32(in[3], in[13]); + u[2] = _mm_add_epi32(in[5], in[11]); + u[5] = _mm_sub_epi32(in[5], in[11]); + u[3] = _mm_add_epi32(in[7], in[9]); + v[4] = _mm_sub_epi32(in[7], in[9]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[15] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[11] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[1] = u[0]; // buf0[0] + out[9] = u[1]; // buf0[1] + out[5] = u[2]; // buf0[2] + out[13] = u[3]; // buf0[3] +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + v[0] = _mm_add_epi32(u[0], u[4]); + v[4] = _mm_sub_epi32(u[0], u[4]); + v[1] = _mm_add_epi32(u[1], u[5]); + v[5] = _mm_sub_epi32(u[1], u[5]); + v[2] = _mm_add_epi32(u[2], u[6]); + v[6] = _mm_sub_epi32(u[2], u[6]); + v[3] = _mm_add_epi32(u[3], u[7]); + v[7] = _mm_sub_epi32(u[3], u[7]); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + v[0] = _mm_add_epi32(u[0], u[2]); + v[2] = _mm_sub_epi32(u[0], u[2]); + v[1] = _mm_add_epi32(u[1], u[3]); + v[3] = _mm_sub_epi32(u[1], u[3]); + v[4] = _mm_add_epi32(u[4], u[6]); + v[6] = _mm_sub_epi32(u[4], u[6]); + v[5] = _mm_add_epi32(u[5], u[7]); + v[7] = _mm_sub_epi32(u[5], u[7]); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + v[0] = _mm_add_epi32(u[0], u[4]); + v[4] = _mm_sub_epi32(u[0], u[4]); + v[1] = _mm_add_epi32(u[1], u[5]); + v[5] = _mm_sub_epi32(u[1], u[5]); + v[2] = _mm_add_epi32(u[2], u[6]); + v[6] = _mm_sub_epi32(u[2], u[6]); + v[3] = _mm_add_epi32(u[3], u[7]); + v[7] = _mm_sub_epi32(u[3], u[7]); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + v[0] = _mm_add_epi32(u[0], u[2]); + v[2] = _mm_sub_epi32(u[0], u[2]); + v[1] = _mm_add_epi32(u[1], u[3]); + v[3] = _mm_sub_epi32(u[1], u[3]); + v[4] = _mm_add_epi32(u[4], u[6]); + v[6] = _mm_sub_epi32(u[4], u[6]); + v[5] = _mm_add_epi32(u[5], u[7]); + v[7] = _mm_sub_epi32(u[5], u[7]); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); +} + +void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + int tx_type, int bd) { + __m128i in[16], out[16]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &fwd_txfm_2d_cfg_dct_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_ADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_8; + load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_FLIPADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_8; + load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); + fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_8; + load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); + fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); + col_txfm_8x8_rounding(out, -cfg->shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } + (void)bd; +} + +// Hybrid Transform 16x16 + +static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { + int row_index = 0; + int dst_index = 0; + int src_index = 0; + + // row 0, 1, .., 7 + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 8); + + // row 8, 9, ..., 15 + src_index += 16; + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 16); +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i in[64]; + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); + load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); + + // load second 8 columns + load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); + load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); + + convert_8x8_to_16x16(in, out); +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x; + const int col_num = 4; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[7]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm_mullo_epi32(u[10], cospim32); + x = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[13], cospim32); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospim32); + x = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi32); + x = _mm_mullo_epi32(u[12], cospim32); + v[12] = _mm_sub_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[3]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm_mullo_epi32(v[5], cospim32); + x = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi32); + x = _mm_mullo_epi32(v[6], cospim32); + u[6] = _mm_sub_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[15] = _mm_add_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm_mullo_epi32(u[0], cospi32); + u[1] = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(u[0], u[1]); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(u[0], u[1]); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(u[2], cospi48); + x = _mm_mullo_epi32(u[3], cospi16); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(u[2], cospi16); + x = _mm_mullo_epi32(u[3], cospi48); + v[3] = _mm_sub_epi32(x, v[3]); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm_mullo_epi32(u[9], cospim16); + x = _mm_mullo_epi32(u[14], cospi48); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi48); + x = _mm_mullo_epi32(u[14], cospim16); + v[14] = _mm_sub_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospim48); + x = _mm_mullo_epi32(u[13], cospim16); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospim16); + x = _mm_mullo_epi32(u[13], cospim48); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi56); + x = _mm_mullo_epi32(v[7], cospi8); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[7] = _mm_mullo_epi32(v[4], cospi8); + x = _mm_mullo_epi32(v[7], cospi56); + u[7] = _mm_sub_epi32(x, u[7]); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[5] = _mm_mullo_epi32(v[5], cospi24); + x = _mm_mullo_epi32(v[6], cospi40); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi40); + x = _mm_mullo_epi32(v[6], cospi24); + u[6] = _mm_sub_epi32(x, u[6]); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[11], v[10]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi60); + x = _mm_mullo_epi32(u[15], cospi4); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[15] = _mm_mullo_epi32(u[8], cospi4); + x = _mm_mullo_epi32(u[15], cospi60); + v[15] = _mm_sub_epi32(x, v[15]); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + v[9] = _mm_mullo_epi32(u[9], cospi28); + x = _mm_mullo_epi32(u[14], cospi36); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi36); + x = _mm_mullo_epi32(u[14], cospi28); + v[14] = _mm_sub_epi32(x, v[14]); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi44); + x = _mm_mullo_epi32(u[13], cospi20); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi20); + x = _mm_mullo_epi32(u[13], cospi44); + v[13] = _mm_sub_epi32(x, v[13]); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospi12); + x = _mm_mullo_epi32(u[12], cospi52); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi52); + x = _mm_mullo_epi32(u[12], cospi12); + v[12] = _mm_sub_epi32(x, v[12]); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = v[8]; + out[2 * col_num + col] = v[4]; + out[3 * col_num + col] = v[12]; + out[4 * col_num + col] = v[2]; + out[5 * col_num + col] = v[10]; + out[6 * col_num + col] = v[6]; + out[7 * col_num + col] = v[14]; + out[8 * col_num + col] = v[1]; + out[9 * col_num + col] = v[9]; + out[10 * col_num + col] = v[5]; + out[11 * col_num + col] = v[13]; + out[12 * col_num + col] = v[3]; + out[13 * col_num + col] = v[11]; + out[14 * col_num + col] = v[7]; + out[15 * col_num + col] = v[15]; + } +} + +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x, y; + const int col_num = 4; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); + x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); + x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); + x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); + x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); + x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); + x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); + x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); + x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34); + x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); + x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); + x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); + x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); + x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); + x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); + x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); + x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[8]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + u[0] = _mm_add_epi32(v[0], v[2]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]); + out[2 * col_num + col] = v[12]; + out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]); + out[4 * col_num + col] = v[6]; + out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]); + out[6 * col_num + col] = v[10]; + out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]); + out[8 * col_num + col] = v[3]; + out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]); + out[10 * col_num + col] = v[15]; + out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]); + out[12 * col_num + col] = v[5]; + out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]); + out[14 * col_num + col] = v[9]; + out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]); + } +} + +static void col_txfm_16x16_rounding(__m128i *in, int shift) { + // Note: + // We split 16x16 rounding into 4 sections of 8x8 rounding, + // instead of 4 columns + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); + col_txfm_8x8_rounding(&in[32], shift); + col_txfm_8x8_rounding(&in[48], shift); +} + +static void write_buffer_16x16(const __m128i *in, tran_low_t *output) { + const int size_8x8 = 16 * 4; + write_buffer_8x8(&in[0], output); + output += size_8x8; + write_buffer_8x8(&in[16], output); + output += size_8x8; + write_buffer_8x8(&in[32], output); + output += size_8x8; + write_buffer_8x8(&in[48], output); +} + +void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, int tx_type, int bd) { + __m128i in[64], out[64]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &fwd_txfm_2d_cfg_dct_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); + fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_ADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_16; + load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); + fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + cfg = &fwd_txfm_2d_cfg_adst_dct_16; + load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_FLIPADST: + cfg = &fwd_txfm_2d_cfg_dct_adst_16; + load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); + fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_FLIPADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_ADST: + cfg = &fwd_txfm_2d_cfg_adst_adst_16; + load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); + fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); + col_txfm_16x16_rounding(out, -cfg->shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; +#endif // CONFIG_EXT_TX + default: assert(0); + } + (void)bd; +} diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c new file mode 100644 index 000000000..198e4e4c4 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -0,0 +1,1678 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // avx2 + +#include "./av1_rtcd.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/x86/fwd_txfm_avx2.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static int32_t get_16x16_sum(const int16_t *input, int stride) { + __m256i r0, r1, r2, r3, u0, u1; + __m256i zero = _mm256_setzero_si256(); + __m256i sum = _mm256_setzero_si256(); + const int16_t *blockBound = input + (stride << 4); + __m128i v0, v1; + + while (input < blockBound) { + r0 = _mm256_loadu_si256((__m256i const *)input); + r1 = _mm256_loadu_si256((__m256i const *)(input + stride)); + r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride)); + r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride)); + + u0 = _mm256_add_epi16(r0, r1); + u1 = _mm256_add_epi16(r2, r3); + sum = _mm256_add_epi16(sum, u0); + sum = _mm256_add_epi16(sum, u1); + + input += stride << 2; + } + + // unpack 16 int16_t into 2x8 int32_t + u0 = _mm256_unpacklo_epi16(zero, sum); + u1 = _mm256_unpackhi_epi16(zero, sum); + u0 = _mm256_srai_epi32(u0, 16); + u1 = _mm256_srai_epi32(u1, 16); + sum = _mm256_add_epi32(u0, u1); + + u0 = _mm256_srli_si256(sum, 8); + u1 = _mm256_add_epi32(sum, u0); + + v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1), + _mm256_castsi256_si128(u1)); + v1 = _mm_srli_si128(v0, 4); + v0 = _mm_add_epi32(v0, v1); + return (int32_t)_mm_extract_epi32(v0, 0); +} + +void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output, + int stride) { + int32_t dc = get_16x16_sum(input, stride); + output[0] = (tran_low_t)(dc >> 1); + _mm256_zeroupper(); +} + +static INLINE void load_buffer_16x16(const int16_t *input, int stride, + int flipud, int fliplr, __m256i *in) { + if (!flipud) { + in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); + in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); + in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); + in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); + in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); + in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); + in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); + in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); + in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); + in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); + in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); + in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); + in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); + in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); + in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); + in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); + } else { + in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride)); + in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride)); + in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride)); + in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride)); + in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride)); + in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride)); + in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride)); + in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride)); + in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride)); + in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride)); + in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride)); + in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride)); + in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride)); + in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride)); + in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride)); + in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride)); + } + + if (fliplr) { + mm256_reverse_epi16(&in[0]); + mm256_reverse_epi16(&in[1]); + mm256_reverse_epi16(&in[2]); + mm256_reverse_epi16(&in[3]); + mm256_reverse_epi16(&in[4]); + mm256_reverse_epi16(&in[5]); + mm256_reverse_epi16(&in[6]); + mm256_reverse_epi16(&in[7]); + mm256_reverse_epi16(&in[8]); + mm256_reverse_epi16(&in[9]); + mm256_reverse_epi16(&in[10]); + mm256_reverse_epi16(&in[11]); + mm256_reverse_epi16(&in[12]); + mm256_reverse_epi16(&in[13]); + mm256_reverse_epi16(&in[14]); + mm256_reverse_epi16(&in[15]); + } + + in[0] = _mm256_slli_epi16(in[0], 2); + in[1] = _mm256_slli_epi16(in[1], 2); + in[2] = _mm256_slli_epi16(in[2], 2); + in[3] = _mm256_slli_epi16(in[3], 2); + in[4] = _mm256_slli_epi16(in[4], 2); + in[5] = _mm256_slli_epi16(in[5], 2); + in[6] = _mm256_slli_epi16(in[6], 2); + in[7] = _mm256_slli_epi16(in[7], 2); + in[8] = _mm256_slli_epi16(in[8], 2); + in[9] = _mm256_slli_epi16(in[9], 2); + in[10] = _mm256_slli_epi16(in[10], 2); + in[11] = _mm256_slli_epi16(in[11], 2); + in[12] = _mm256_slli_epi16(in[12], 2); + in[13] = _mm256_slli_epi16(in[13], 2); + in[14] = _mm256_slli_epi16(in[14], 2); + in[15] = _mm256_slli_epi16(in[15], 2); +} + +static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) { + int i; + for (i = 0; i < 16; ++i) { + storeu_output_avx2(&in[i], output + (i << 4)); + } +} + +static void right_shift_16x16(__m256i *in) { + const __m256i one = _mm256_set1_epi16(1); + __m256i s0 = _mm256_srai_epi16(in[0], 15); + __m256i s1 = _mm256_srai_epi16(in[1], 15); + __m256i s2 = _mm256_srai_epi16(in[2], 15); + __m256i s3 = _mm256_srai_epi16(in[3], 15); + __m256i s4 = _mm256_srai_epi16(in[4], 15); + __m256i s5 = _mm256_srai_epi16(in[5], 15); + __m256i s6 = _mm256_srai_epi16(in[6], 15); + __m256i s7 = _mm256_srai_epi16(in[7], 15); + __m256i s8 = _mm256_srai_epi16(in[8], 15); + __m256i s9 = _mm256_srai_epi16(in[9], 15); + __m256i s10 = _mm256_srai_epi16(in[10], 15); + __m256i s11 = _mm256_srai_epi16(in[11], 15); + __m256i s12 = _mm256_srai_epi16(in[12], 15); + __m256i s13 = _mm256_srai_epi16(in[13], 15); + __m256i s14 = _mm256_srai_epi16(in[14], 15); + __m256i s15 = _mm256_srai_epi16(in[15], 15); + + in[0] = _mm256_add_epi16(in[0], one); + in[1] = _mm256_add_epi16(in[1], one); + in[2] = _mm256_add_epi16(in[2], one); + in[3] = _mm256_add_epi16(in[3], one); + in[4] = _mm256_add_epi16(in[4], one); + in[5] = _mm256_add_epi16(in[5], one); + in[6] = _mm256_add_epi16(in[6], one); + in[7] = _mm256_add_epi16(in[7], one); + in[8] = _mm256_add_epi16(in[8], one); + in[9] = _mm256_add_epi16(in[9], one); + in[10] = _mm256_add_epi16(in[10], one); + in[11] = _mm256_add_epi16(in[11], one); + in[12] = _mm256_add_epi16(in[12], one); + in[13] = _mm256_add_epi16(in[13], one); + in[14] = _mm256_add_epi16(in[14], one); + in[15] = _mm256_add_epi16(in[15], one); + + in[0] = _mm256_sub_epi16(in[0], s0); + in[1] = _mm256_sub_epi16(in[1], s1); + in[2] = _mm256_sub_epi16(in[2], s2); + in[3] = _mm256_sub_epi16(in[3], s3); + in[4] = _mm256_sub_epi16(in[4], s4); + in[5] = _mm256_sub_epi16(in[5], s5); + in[6] = _mm256_sub_epi16(in[6], s6); + in[7] = _mm256_sub_epi16(in[7], s7); + in[8] = _mm256_sub_epi16(in[8], s8); + in[9] = _mm256_sub_epi16(in[9], s9); + in[10] = _mm256_sub_epi16(in[10], s10); + in[11] = _mm256_sub_epi16(in[11], s11); + in[12] = _mm256_sub_epi16(in[12], s12); + in[13] = _mm256_sub_epi16(in[13], s13); + in[14] = _mm256_sub_epi16(in[14], s14); + in[15] = _mm256_sub_epi16(in[15], s15); + + in[0] = _mm256_srai_epi16(in[0], 2); + in[1] = _mm256_srai_epi16(in[1], 2); + in[2] = _mm256_srai_epi16(in[2], 2); + in[3] = _mm256_srai_epi16(in[3], 2); + in[4] = _mm256_srai_epi16(in[4], 2); + in[5] = _mm256_srai_epi16(in[5], 2); + in[6] = _mm256_srai_epi16(in[6], 2); + in[7] = _mm256_srai_epi16(in[7], 2); + in[8] = _mm256_srai_epi16(in[8], 2); + in[9] = _mm256_srai_epi16(in[9], 2); + in[10] = _mm256_srai_epi16(in[10], 2); + in[11] = _mm256_srai_epi16(in[11], 2); + in[12] = _mm256_srai_epi16(in[12], 2); + in[13] = _mm256_srai_epi16(in[13], 2); + in[14] = _mm256_srai_epi16(in[14], 2); + in[15] = _mm256_srai_epi16(in[15], 2); +} + +static void fdct16_avx2(__m256i *in) { + // sequence: cospi_L_H = pairs(L, H) and L first + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); + const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); + const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + + const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64); + const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + + const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64); + const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + + const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64); + const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + + const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64); + const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i s0, s1, s2, s3, s4, s5, s6, s7; + __m256i t0, t1, t2, t3, t4, t5, t6, t7; + __m256i v0, v1, v2, v3; + __m256i x0, x1; + + // 0, 4, 8, 12 + u0 = _mm256_add_epi16(in[0], in[15]); + u1 = _mm256_add_epi16(in[1], in[14]); + u2 = _mm256_add_epi16(in[2], in[13]); + u3 = _mm256_add_epi16(in[3], in[12]); + u4 = _mm256_add_epi16(in[4], in[11]); + u5 = _mm256_add_epi16(in[5], in[10]); + u6 = _mm256_add_epi16(in[6], in[9]); + u7 = _mm256_add_epi16(in[7], in[8]); + + s0 = _mm256_add_epi16(u0, u7); + s1 = _mm256_add_epi16(u1, u6); + s2 = _mm256_add_epi16(u2, u5); + s3 = _mm256_add_epi16(u3, u4); + + // 0, 8 + v0 = _mm256_add_epi16(s0, s3); + v1 = _mm256_add_epi16(s1, s2); + + x0 = _mm256_unpacklo_epi16(v0, v1); + x1 = _mm256_unpackhi_epi16(v0, v1); + + t0 = butter_fly(x0, x1, cospi_p16_p16); + t1 = butter_fly(x0, x1, cospi_p16_m16); + + // 4, 12 + v0 = _mm256_sub_epi16(s1, s2); + v1 = _mm256_sub_epi16(s0, s3); + + x0 = _mm256_unpacklo_epi16(v0, v1); + x1 = _mm256_unpackhi_epi16(v0, v1); + + t2 = butter_fly(x0, x1, cospi_p24_p08); + t3 = butter_fly(x0, x1, cospi_m08_p24); + + // 2, 6, 10, 14 + s0 = _mm256_sub_epi16(u3, u4); + s1 = _mm256_sub_epi16(u2, u5); + s2 = _mm256_sub_epi16(u1, u6); + s3 = _mm256_sub_epi16(u0, u7); + + v0 = s0; // output[4] + v3 = s3; // output[7] + + x0 = _mm256_unpacklo_epi16(s2, s1); + x1 = _mm256_unpackhi_epi16(s2, s1); + + v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5] + v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6] + + s0 = _mm256_add_epi16(v0, v1); // step[4] + s1 = _mm256_sub_epi16(v0, v1); // step[5] + s2 = _mm256_sub_epi16(v3, v2); // step[6] + s3 = _mm256_add_epi16(v3, v2); // step[7] + + // 2, 14 + x0 = _mm256_unpacklo_epi16(s0, s3); + x1 = _mm256_unpackhi_epi16(s0, s3); + + t4 = butter_fly(x0, x1, cospi_p28_p04); + t5 = butter_fly(x0, x1, cospi_m04_p28); + + // 10, 6 + x0 = _mm256_unpacklo_epi16(s1, s2); + x1 = _mm256_unpackhi_epi16(s1, s2); + t6 = butter_fly(x0, x1, cospi_p12_p20); + t7 = butter_fly(x0, x1, cospi_m20_p12); + + // 1, 3, 5, 7, 9, 11, 13, 15 + s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] + s1 = _mm256_sub_epi16(in[6], in[9]); // step[9] + u2 = _mm256_sub_epi16(in[5], in[10]); + u3 = _mm256_sub_epi16(in[4], in[11]); + u4 = _mm256_sub_epi16(in[3], in[12]); + u5 = _mm256_sub_epi16(in[2], in[13]); + s6 = _mm256_sub_epi16(in[1], in[14]); // step[14] + s7 = _mm256_sub_epi16(in[0], in[15]); // step[15] + + in[0] = t0; + in[8] = t1; + in[4] = t2; + in[12] = t3; + in[2] = t4; + in[14] = t5; + in[10] = t6; + in[6] = t7; + + x0 = _mm256_unpacklo_epi16(u5, u2); + x1 = _mm256_unpackhi_epi16(u5, u2); + + s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13] + s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10] + + x0 = _mm256_unpacklo_epi16(u4, u3); + x1 = _mm256_unpackhi_epi16(u4, u3); + + s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12] + s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11] + + u0 = _mm256_add_epi16(s0, s4); // output[8] + u1 = _mm256_add_epi16(s1, s5); + u2 = _mm256_sub_epi16(s1, s5); + u3 = _mm256_sub_epi16(s0, s4); + u4 = _mm256_sub_epi16(s7, s3); + u5 = _mm256_sub_epi16(s6, s2); + u6 = _mm256_add_epi16(s6, s2); + u7 = _mm256_add_epi16(s7, s3); + + // stage 4 + s0 = u0; + s3 = u3; + s4 = u4; + s7 = u7; + + x0 = _mm256_unpacklo_epi16(u1, u6); + x1 = _mm256_unpackhi_epi16(u1, u6); + + s1 = butter_fly(x0, x1, cospi_m08_p24); + s6 = butter_fly(x0, x1, cospi_p24_p08); + + x0 = _mm256_unpacklo_epi16(u2, u5); + x1 = _mm256_unpackhi_epi16(u2, u5); + + s2 = butter_fly(x0, x1, cospi_m24_m08); + s5 = butter_fly(x0, x1, cospi_m08_p24); + + // stage 5 + u0 = _mm256_add_epi16(s0, s1); + u1 = _mm256_sub_epi16(s0, s1); + u2 = _mm256_sub_epi16(s3, s2); + u3 = _mm256_add_epi16(s3, s2); + u4 = _mm256_add_epi16(s4, s5); + u5 = _mm256_sub_epi16(s4, s5); + u6 = _mm256_sub_epi16(s7, s6); + u7 = _mm256_add_epi16(s7, s6); + + // stage 6 + x0 = _mm256_unpacklo_epi16(u0, u7); + x1 = _mm256_unpackhi_epi16(u0, u7); + in[1] = butter_fly(x0, x1, cospi_p30_p02); + in[15] = butter_fly(x0, x1, cospi_m02_p30); + + x0 = _mm256_unpacklo_epi16(u1, u6); + x1 = _mm256_unpackhi_epi16(u1, u6); + in[9] = butter_fly(x0, x1, cospi_p14_p18); + in[7] = butter_fly(x0, x1, cospi_m18_p14); + + x0 = _mm256_unpacklo_epi16(u2, u5); + x1 = _mm256_unpackhi_epi16(u2, u5); + in[5] = butter_fly(x0, x1, cospi_p22_p10); + in[11] = butter_fly(x0, x1, cospi_m10_p22); + + x0 = _mm256_unpacklo_epi16(u3, u4); + x1 = _mm256_unpackhi_epi16(u3, u4); + in[13] = butter_fly(x0, x1, cospi_p06_p26); + in[3] = butter_fly(x0, x1, cospi_m26_p06); +} + +void fadst16_avx2(__m256i *in) { + const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); + const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); + const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); + const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); + const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); + const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); + const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); + const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64); + const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); + const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); + const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); + const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); + const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); + const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); + const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); + const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); + const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); + const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + const __m256i zero = _mm256_setzero_si256(); + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m256i y0, y1; + + // stage 1, s takes low 256 bits; x takes high 256 bits + y0 = _mm256_unpacklo_epi16(in[15], in[0]); + y1 = _mm256_unpackhi_epi16(in[15], in[0]); + s0 = _mm256_madd_epi16(y0, cospi_p01_p31); + x0 = _mm256_madd_epi16(y1, cospi_p01_p31); + s1 = _mm256_madd_epi16(y0, cospi_p31_m01); + x1 = _mm256_madd_epi16(y1, cospi_p31_m01); + + y0 = _mm256_unpacklo_epi16(in[13], in[2]); + y1 = _mm256_unpackhi_epi16(in[13], in[2]); + s2 = _mm256_madd_epi16(y0, cospi_p05_p27); + x2 = _mm256_madd_epi16(y1, cospi_p05_p27); + s3 = _mm256_madd_epi16(y0, cospi_p27_m05); + x3 = _mm256_madd_epi16(y1, cospi_p27_m05); + + y0 = _mm256_unpacklo_epi16(in[11], in[4]); + y1 = _mm256_unpackhi_epi16(in[11], in[4]); + s4 = _mm256_madd_epi16(y0, cospi_p09_p23); + x4 = _mm256_madd_epi16(y1, cospi_p09_p23); + s5 = _mm256_madd_epi16(y0, cospi_p23_m09); + x5 = _mm256_madd_epi16(y1, cospi_p23_m09); + + y0 = _mm256_unpacklo_epi16(in[9], in[6]); + y1 = _mm256_unpackhi_epi16(in[9], in[6]); + s6 = _mm256_madd_epi16(y0, cospi_p13_p19); + x6 = _mm256_madd_epi16(y1, cospi_p13_p19); + s7 = _mm256_madd_epi16(y0, cospi_p19_m13); + x7 = _mm256_madd_epi16(y1, cospi_p19_m13); + + y0 = _mm256_unpacklo_epi16(in[7], in[8]); + y1 = _mm256_unpackhi_epi16(in[7], in[8]); + s8 = _mm256_madd_epi16(y0, cospi_p17_p15); + x8 = _mm256_madd_epi16(y1, cospi_p17_p15); + s9 = _mm256_madd_epi16(y0, cospi_p15_m17); + x9 = _mm256_madd_epi16(y1, cospi_p15_m17); + + y0 = _mm256_unpacklo_epi16(in[5], in[10]); + y1 = _mm256_unpackhi_epi16(in[5], in[10]); + s10 = _mm256_madd_epi16(y0, cospi_p21_p11); + x10 = _mm256_madd_epi16(y1, cospi_p21_p11); + s11 = _mm256_madd_epi16(y0, cospi_p11_m21); + x11 = _mm256_madd_epi16(y1, cospi_p11_m21); + + y0 = _mm256_unpacklo_epi16(in[3], in[12]); + y1 = _mm256_unpackhi_epi16(in[3], in[12]); + s12 = _mm256_madd_epi16(y0, cospi_p25_p07); + x12 = _mm256_madd_epi16(y1, cospi_p25_p07); + s13 = _mm256_madd_epi16(y0, cospi_p07_m25); + x13 = _mm256_madd_epi16(y1, cospi_p07_m25); + + y0 = _mm256_unpacklo_epi16(in[1], in[14]); + y1 = _mm256_unpackhi_epi16(in[1], in[14]); + s14 = _mm256_madd_epi16(y0, cospi_p29_p03); + x14 = _mm256_madd_epi16(y1, cospi_p29_p03); + s15 = _mm256_madd_epi16(y0, cospi_p03_m29); + x15 = _mm256_madd_epi16(y1, cospi_p03_m29); + + // u takes low 256 bits; v takes high 256 bits + u0 = _mm256_add_epi32(s0, s8); + u1 = _mm256_add_epi32(s1, s9); + u2 = _mm256_add_epi32(s2, s10); + u3 = _mm256_add_epi32(s3, s11); + u4 = _mm256_add_epi32(s4, s12); + u5 = _mm256_add_epi32(s5, s13); + u6 = _mm256_add_epi32(s6, s14); + u7 = _mm256_add_epi32(s7, s15); + + u8 = _mm256_sub_epi32(s0, s8); + u9 = _mm256_sub_epi32(s1, s9); + u10 = _mm256_sub_epi32(s2, s10); + u11 = _mm256_sub_epi32(s3, s11); + u12 = _mm256_sub_epi32(s4, s12); + u13 = _mm256_sub_epi32(s5, s13); + u14 = _mm256_sub_epi32(s6, s14); + u15 = _mm256_sub_epi32(s7, s15); + + v0 = _mm256_add_epi32(x0, x8); + v1 = _mm256_add_epi32(x1, x9); + v2 = _mm256_add_epi32(x2, x10); + v3 = _mm256_add_epi32(x3, x11); + v4 = _mm256_add_epi32(x4, x12); + v5 = _mm256_add_epi32(x5, x13); + v6 = _mm256_add_epi32(x6, x14); + v7 = _mm256_add_epi32(x7, x15); + + v8 = _mm256_sub_epi32(x0, x8); + v9 = _mm256_sub_epi32(x1, x9); + v10 = _mm256_sub_epi32(x2, x10); + v11 = _mm256_sub_epi32(x3, x11); + v12 = _mm256_sub_epi32(x4, x12); + v13 = _mm256_sub_epi32(x5, x13); + v14 = _mm256_sub_epi32(x6, x14); + v15 = _mm256_sub_epi32(x7, x15); + + // low 256 bits rounding + u8 = _mm256_add_epi32(u8, dct_rounding); + u9 = _mm256_add_epi32(u9, dct_rounding); + u10 = _mm256_add_epi32(u10, dct_rounding); + u11 = _mm256_add_epi32(u11, dct_rounding); + u12 = _mm256_add_epi32(u12, dct_rounding); + u13 = _mm256_add_epi32(u13, dct_rounding); + u14 = _mm256_add_epi32(u14, dct_rounding); + u15 = _mm256_add_epi32(u15, dct_rounding); + + u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS); + u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS); + u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); + u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); + u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); + u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); + u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); + u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); + + // high 256 bits rounding + v8 = _mm256_add_epi32(v8, dct_rounding); + v9 = _mm256_add_epi32(v9, dct_rounding); + v10 = _mm256_add_epi32(v10, dct_rounding); + v11 = _mm256_add_epi32(v11, dct_rounding); + v12 = _mm256_add_epi32(v12, dct_rounding); + v13 = _mm256_add_epi32(v13, dct_rounding); + v14 = _mm256_add_epi32(v14, dct_rounding); + v15 = _mm256_add_epi32(v15, dct_rounding); + + v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); + v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); + v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); + v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); + v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); + v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); + v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); + v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); + + // Saturation pack 32-bit to 16-bit + x8 = _mm256_packs_epi32(u8, v8); + x9 = _mm256_packs_epi32(u9, v9); + x10 = _mm256_packs_epi32(u10, v10); + x11 = _mm256_packs_epi32(u11, v11); + x12 = _mm256_packs_epi32(u12, v12); + x13 = _mm256_packs_epi32(u13, v13); + x14 = _mm256_packs_epi32(u14, v14); + x15 = _mm256_packs_epi32(u15, v15); + + // stage 2 + y0 = _mm256_unpacklo_epi16(x8, x9); + y1 = _mm256_unpackhi_epi16(x8, x9); + s8 = _mm256_madd_epi16(y0, cospi_p04_p28); + x8 = _mm256_madd_epi16(y1, cospi_p04_p28); + s9 = _mm256_madd_epi16(y0, cospi_p28_m04); + x9 = _mm256_madd_epi16(y1, cospi_p28_m04); + + y0 = _mm256_unpacklo_epi16(x10, x11); + y1 = _mm256_unpackhi_epi16(x10, x11); + s10 = _mm256_madd_epi16(y0, cospi_p20_p12); + x10 = _mm256_madd_epi16(y1, cospi_p20_p12); + s11 = _mm256_madd_epi16(y0, cospi_p12_m20); + x11 = _mm256_madd_epi16(y1, cospi_p12_m20); + + y0 = _mm256_unpacklo_epi16(x12, x13); + y1 = _mm256_unpackhi_epi16(x12, x13); + s12 = _mm256_madd_epi16(y0, cospi_m28_p04); + x12 = _mm256_madd_epi16(y1, cospi_m28_p04); + s13 = _mm256_madd_epi16(y0, cospi_p04_p28); + x13 = _mm256_madd_epi16(y1, cospi_p04_p28); + + y0 = _mm256_unpacklo_epi16(x14, x15); + y1 = _mm256_unpackhi_epi16(x14, x15); + s14 = _mm256_madd_epi16(y0, cospi_m12_p20); + x14 = _mm256_madd_epi16(y1, cospi_m12_p20); + s15 = _mm256_madd_epi16(y0, cospi_p20_p12); + x15 = _mm256_madd_epi16(y1, cospi_p20_p12); + + x0 = _mm256_add_epi32(u0, u4); + s0 = _mm256_add_epi32(v0, v4); + x1 = _mm256_add_epi32(u1, u5); + s1 = _mm256_add_epi32(v1, v5); + x2 = _mm256_add_epi32(u2, u6); + s2 = _mm256_add_epi32(v2, v6); + x3 = _mm256_add_epi32(u3, u7); + s3 = _mm256_add_epi32(v3, v7); + + v8 = _mm256_sub_epi32(u0, u4); + v9 = _mm256_sub_epi32(v0, v4); + v10 = _mm256_sub_epi32(u1, u5); + v11 = _mm256_sub_epi32(v1, v5); + v12 = _mm256_sub_epi32(u2, u6); + v13 = _mm256_sub_epi32(v2, v6); + v14 = _mm256_sub_epi32(u3, u7); + v15 = _mm256_sub_epi32(v3, v7); + + v8 = _mm256_add_epi32(v8, dct_rounding); + v9 = _mm256_add_epi32(v9, dct_rounding); + v10 = _mm256_add_epi32(v10, dct_rounding); + v11 = _mm256_add_epi32(v11, dct_rounding); + v12 = _mm256_add_epi32(v12, dct_rounding); + v13 = _mm256_add_epi32(v13, dct_rounding); + v14 = _mm256_add_epi32(v14, dct_rounding); + v15 = _mm256_add_epi32(v15, dct_rounding); + + v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS); + v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS); + v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); + v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); + v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); + v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); + v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); + v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); + + x4 = _mm256_packs_epi32(v8, v9); + x5 = _mm256_packs_epi32(v10, v11); + x6 = _mm256_packs_epi32(v12, v13); + x7 = _mm256_packs_epi32(v14, v15); + + u8 = _mm256_add_epi32(s8, s12); + u9 = _mm256_add_epi32(s9, s13); + u10 = _mm256_add_epi32(s10, s14); + u11 = _mm256_add_epi32(s11, s15); + u12 = _mm256_sub_epi32(s8, s12); + u13 = _mm256_sub_epi32(s9, s13); + u14 = _mm256_sub_epi32(s10, s14); + u15 = _mm256_sub_epi32(s11, s15); + + v8 = _mm256_add_epi32(x8, x12); + v9 = _mm256_add_epi32(x9, x13); + v10 = _mm256_add_epi32(x10, x14); + v11 = _mm256_add_epi32(x11, x15); + v12 = _mm256_sub_epi32(x8, x12); + v13 = _mm256_sub_epi32(x9, x13); + v14 = _mm256_sub_epi32(x10, x14); + v15 = _mm256_sub_epi32(x11, x15); + + u12 = _mm256_add_epi32(u12, dct_rounding); + u13 = _mm256_add_epi32(u13, dct_rounding); + u14 = _mm256_add_epi32(u14, dct_rounding); + u15 = _mm256_add_epi32(u15, dct_rounding); + + u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); + u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); + u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); + u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); + + v12 = _mm256_add_epi32(v12, dct_rounding); + v13 = _mm256_add_epi32(v13, dct_rounding); + v14 = _mm256_add_epi32(v14, dct_rounding); + v15 = _mm256_add_epi32(v15, dct_rounding); + + v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); + v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); + v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); + v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); + + x12 = _mm256_packs_epi32(u12, v12); + x13 = _mm256_packs_epi32(u13, v13); + x14 = _mm256_packs_epi32(u14, v14); + x15 = _mm256_packs_epi32(u15, v15); + + // stage 3 + y0 = _mm256_unpacklo_epi16(x4, x5); + y1 = _mm256_unpackhi_epi16(x4, x5); + s4 = _mm256_madd_epi16(y0, cospi_p08_p24); + x4 = _mm256_madd_epi16(y1, cospi_p08_p24); + s5 = _mm256_madd_epi16(y0, cospi_p24_m08); + x5 = _mm256_madd_epi16(y1, cospi_p24_m08); + + y0 = _mm256_unpacklo_epi16(x6, x7); + y1 = _mm256_unpackhi_epi16(x6, x7); + s6 = _mm256_madd_epi16(y0, cospi_m24_p08); + x6 = _mm256_madd_epi16(y1, cospi_m24_p08); + s7 = _mm256_madd_epi16(y0, cospi_p08_p24); + x7 = _mm256_madd_epi16(y1, cospi_p08_p24); + + y0 = _mm256_unpacklo_epi16(x12, x13); + y1 = _mm256_unpackhi_epi16(x12, x13); + s12 = _mm256_madd_epi16(y0, cospi_p08_p24); + x12 = _mm256_madd_epi16(y1, cospi_p08_p24); + s13 = _mm256_madd_epi16(y0, cospi_p24_m08); + x13 = _mm256_madd_epi16(y1, cospi_p24_m08); + + y0 = _mm256_unpacklo_epi16(x14, x15); + y1 = _mm256_unpackhi_epi16(x14, x15); + s14 = _mm256_madd_epi16(y0, cospi_m24_p08); + x14 = _mm256_madd_epi16(y1, cospi_m24_p08); + s15 = _mm256_madd_epi16(y0, cospi_p08_p24); + x15 = _mm256_madd_epi16(y1, cospi_p08_p24); + + u0 = _mm256_add_epi32(x0, x2); + v0 = _mm256_add_epi32(s0, s2); + u1 = _mm256_add_epi32(x1, x3); + v1 = _mm256_add_epi32(s1, s3); + u2 = _mm256_sub_epi32(x0, x2); + v2 = _mm256_sub_epi32(s0, s2); + u3 = _mm256_sub_epi32(x1, x3); + v3 = _mm256_sub_epi32(s1, s3); + + u0 = _mm256_add_epi32(u0, dct_rounding); + v0 = _mm256_add_epi32(v0, dct_rounding); + u1 = _mm256_add_epi32(u1, dct_rounding); + v1 = _mm256_add_epi32(v1, dct_rounding); + u2 = _mm256_add_epi32(u2, dct_rounding); + v2 = _mm256_add_epi32(v2, dct_rounding); + u3 = _mm256_add_epi32(u3, dct_rounding); + v3 = _mm256_add_epi32(v3, dct_rounding); + + u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); + v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); + v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); + v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); + v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); + + in[0] = _mm256_packs_epi32(u0, v0); + x1 = _mm256_packs_epi32(u1, v1); + x2 = _mm256_packs_epi32(u2, v2); + x3 = _mm256_packs_epi32(u3, v3); + + // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7 + u4 = _mm256_add_epi32(s4, s6); + u5 = _mm256_add_epi32(s5, s7); + u6 = _mm256_sub_epi32(s4, s6); + u7 = _mm256_sub_epi32(s5, s7); + + v4 = _mm256_add_epi32(x4, x6); + v5 = _mm256_add_epi32(x5, x7); + v6 = _mm256_sub_epi32(x4, x6); + v7 = _mm256_sub_epi32(x5, x7); + + u4 = _mm256_add_epi32(u4, dct_rounding); + u5 = _mm256_add_epi32(u5, dct_rounding); + u6 = _mm256_add_epi32(u6, dct_rounding); + u7 = _mm256_add_epi32(u7, dct_rounding); + + u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS); + u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS); + u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); + u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); + + v4 = _mm256_add_epi32(v4, dct_rounding); + v5 = _mm256_add_epi32(v5, dct_rounding); + v6 = _mm256_add_epi32(v6, dct_rounding); + v7 = _mm256_add_epi32(v7, dct_rounding); + + v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS); + v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS); + v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); + v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); + + x4 = _mm256_packs_epi32(u4, v4); + in[12] = _mm256_packs_epi32(u5, v5); + x6 = _mm256_packs_epi32(u6, v6); + x7 = _mm256_packs_epi32(u7, v7); + + u0 = _mm256_add_epi32(u8, u10); + v0 = _mm256_add_epi32(v8, v10); + u1 = _mm256_add_epi32(u9, u11); + v1 = _mm256_add_epi32(v9, v11); + u2 = _mm256_sub_epi32(u8, u10); + v2 = _mm256_sub_epi32(v8, v10); + u3 = _mm256_sub_epi32(u9, u11); + v3 = _mm256_sub_epi32(v9, v11); + + u0 = _mm256_add_epi32(u0, dct_rounding); + v0 = _mm256_add_epi32(v0, dct_rounding); + u1 = _mm256_add_epi32(u1, dct_rounding); + v1 = _mm256_add_epi32(v1, dct_rounding); + u2 = _mm256_add_epi32(u2, dct_rounding); + v2 = _mm256_add_epi32(v2, dct_rounding); + u3 = _mm256_add_epi32(u3, dct_rounding); + v3 = _mm256_add_epi32(v3, dct_rounding); + + u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); + v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); + v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); + v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); + v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); + + x8 = _mm256_packs_epi32(u0, v0); + in[14] = _mm256_packs_epi32(u1, v1); + x10 = _mm256_packs_epi32(u2, v2); + x11 = _mm256_packs_epi32(u3, v3); + + // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15 + u12 = _mm256_add_epi32(s12, s14); + u13 = _mm256_add_epi32(s13, s15); + u14 = _mm256_sub_epi32(s12, s14); + u15 = _mm256_sub_epi32(s13, s15); + + v12 = _mm256_add_epi32(x12, x14); + v13 = _mm256_add_epi32(x13, x15); + v14 = _mm256_sub_epi32(x12, x14); + v15 = _mm256_sub_epi32(x13, x15); + + u12 = _mm256_add_epi32(u12, dct_rounding); + u13 = _mm256_add_epi32(u13, dct_rounding); + u14 = _mm256_add_epi32(u14, dct_rounding); + u15 = _mm256_add_epi32(u15, dct_rounding); + + u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS); + u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS); + u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); + u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); + + v12 = _mm256_add_epi32(v12, dct_rounding); + v13 = _mm256_add_epi32(v13, dct_rounding); + v14 = _mm256_add_epi32(v14, dct_rounding); + v15 = _mm256_add_epi32(v15, dct_rounding); + + v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS); + v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS); + v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); + v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); + + x12 = _mm256_packs_epi32(u12, v12); + x13 = _mm256_packs_epi32(u13, v13); + x14 = _mm256_packs_epi32(u14, v14); + x15 = _mm256_packs_epi32(u15, v15); + in[2] = x12; + + // stage 4 + y0 = _mm256_unpacklo_epi16(x2, x3); + y1 = _mm256_unpackhi_epi16(x2, x3); + s2 = _mm256_madd_epi16(y0, cospi_m16_m16); + x2 = _mm256_madd_epi16(y1, cospi_m16_m16); + s3 = _mm256_madd_epi16(y0, cospi_p16_m16); + x3 = _mm256_madd_epi16(y1, cospi_p16_m16); + + y0 = _mm256_unpacklo_epi16(x6, x7); + y1 = _mm256_unpackhi_epi16(x6, x7); + s6 = _mm256_madd_epi16(y0, cospi_p16_p16); + x6 = _mm256_madd_epi16(y1, cospi_p16_p16); + s7 = _mm256_madd_epi16(y0, cospi_m16_p16); + x7 = _mm256_madd_epi16(y1, cospi_m16_p16); + + y0 = _mm256_unpacklo_epi16(x10, x11); + y1 = _mm256_unpackhi_epi16(x10, x11); + s10 = _mm256_madd_epi16(y0, cospi_p16_p16); + x10 = _mm256_madd_epi16(y1, cospi_p16_p16); + s11 = _mm256_madd_epi16(y0, cospi_m16_p16); + x11 = _mm256_madd_epi16(y1, cospi_m16_p16); + + y0 = _mm256_unpacklo_epi16(x14, x15); + y1 = _mm256_unpackhi_epi16(x14, x15); + s14 = _mm256_madd_epi16(y0, cospi_m16_m16); + x14 = _mm256_madd_epi16(y1, cospi_m16_m16); + s15 = _mm256_madd_epi16(y0, cospi_p16_m16); + x15 = _mm256_madd_epi16(y1, cospi_p16_m16); + + // Rounding + u2 = _mm256_add_epi32(s2, dct_rounding); + u3 = _mm256_add_epi32(s3, dct_rounding); + u6 = _mm256_add_epi32(s6, dct_rounding); + u7 = _mm256_add_epi32(s7, dct_rounding); + + u10 = _mm256_add_epi32(s10, dct_rounding); + u11 = _mm256_add_epi32(s11, dct_rounding); + u14 = _mm256_add_epi32(s14, dct_rounding); + u15 = _mm256_add_epi32(s15, dct_rounding); + + u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS); + u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS); + u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS); + u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS); + + u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS); + u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS); + u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS); + u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS); + + v2 = _mm256_add_epi32(x2, dct_rounding); + v3 = _mm256_add_epi32(x3, dct_rounding); + v6 = _mm256_add_epi32(x6, dct_rounding); + v7 = _mm256_add_epi32(x7, dct_rounding); + + v10 = _mm256_add_epi32(x10, dct_rounding); + v11 = _mm256_add_epi32(x11, dct_rounding); + v14 = _mm256_add_epi32(x14, dct_rounding); + v15 = _mm256_add_epi32(x15, dct_rounding); + + v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS); + v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS); + v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS); + v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS); + + v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS); + v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS); + v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS); + v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS); + + in[7] = _mm256_packs_epi32(u2, v2); + in[8] = _mm256_packs_epi32(u3, v3); + + in[4] = _mm256_packs_epi32(u6, v6); + in[11] = _mm256_packs_epi32(u7, v7); + + in[6] = _mm256_packs_epi32(u10, v10); + in[9] = _mm256_packs_epi32(u11, v11); + + in[5] = _mm256_packs_epi32(u14, v14); + in[10] = _mm256_packs_epi32(u15, v15); + + in[1] = _mm256_sub_epi16(zero, x8); + in[3] = _mm256_sub_epi16(zero, x4); + in[13] = _mm256_sub_epi16(zero, x13); + in[15] = _mm256_sub_epi16(zero, x1); +} + +#if CONFIG_EXT_TX +static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); } +#endif + +void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m256i in[16]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, stride, 0, 0, in); + fdct16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fdct16_avx2(in); + break; + case ADST_DCT: + load_buffer_16x16(input, stride, 0, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fdct16_avx2(in); + break; + case DCT_ADST: + load_buffer_16x16(input, stride, 0, 0, in); + fdct16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case ADST_ADST: + load_buffer_16x16(input, stride, 0, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x16(input, stride, 1, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fdct16_avx2(in); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, stride, 0, 1, in); + fdct16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, stride, 1, 1, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, stride, 0, 1, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, stride, 1, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case IDTX: + load_buffer_16x16(input, stride, 0, 0, in); + fidtx16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fidtx16_avx2(in); + break; + case V_DCT: + load_buffer_16x16(input, stride, 0, 0, in); + fdct16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fidtx16_avx2(in); + break; + case H_DCT: + load_buffer_16x16(input, stride, 0, 0, in); + fidtx16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fdct16_avx2(in); + break; + case V_ADST: + load_buffer_16x16(input, stride, 0, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fidtx16_avx2(in); + break; + case H_ADST: + load_buffer_16x16(input, stride, 0, 0, in); + fidtx16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; + case V_FLIPADST: + load_buffer_16x16(input, stride, 1, 0, in); + fadst16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fidtx16_avx2(in); + break; + case H_FLIPADST: + load_buffer_16x16(input, stride, 0, 1, in); + fidtx16_avx2(in); + mm256_transpose_16x16(in); + right_shift_16x16(in); + fadst16_avx2(in); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + mm256_transpose_16x16(in); + write_buffer_16x16(in, output); + _mm256_zeroupper(); +} + +void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output, + int stride) { + // left and upper corner + int32_t sum = get_16x16_sum(input, stride); + // right and upper corner + sum += get_16x16_sum(input + 16, stride); + // left and lower corner + sum += get_16x16_sum(input + (stride << 4), stride); + // right and lower corner + sum += get_16x16_sum(input + (stride << 4) + 16, stride); + + sum >>= 3; + output[0] = (tran_low_t)sum; + _mm256_zeroupper(); +} + +static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { + int i = 0; + __m256i temp; + while (i < size) { + temp = a0[i]; + a0[i] = a1[i]; + a1[i] = temp; + i++; + } +} + +static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { + mm256_transpose_16x16(in0); + mm256_transpose_16x16(&in0[16]); + mm256_transpose_16x16(in1); + mm256_transpose_16x16(&in1[16]); + mm256_vectors_swap(&in0[16], in1, 16); +} + +static void prepare_16x16_even(const __m256i *in, __m256i *even) { + even[0] = _mm256_add_epi16(in[0], in[31]); + even[1] = _mm256_add_epi16(in[1], in[30]); + even[2] = _mm256_add_epi16(in[2], in[29]); + even[3] = _mm256_add_epi16(in[3], in[28]); + even[4] = _mm256_add_epi16(in[4], in[27]); + even[5] = _mm256_add_epi16(in[5], in[26]); + even[6] = _mm256_add_epi16(in[6], in[25]); + even[7] = _mm256_add_epi16(in[7], in[24]); + even[8] = _mm256_add_epi16(in[8], in[23]); + even[9] = _mm256_add_epi16(in[9], in[22]); + even[10] = _mm256_add_epi16(in[10], in[21]); + even[11] = _mm256_add_epi16(in[11], in[20]); + even[12] = _mm256_add_epi16(in[12], in[19]); + even[13] = _mm256_add_epi16(in[13], in[18]); + even[14] = _mm256_add_epi16(in[14], in[17]); + even[15] = _mm256_add_epi16(in[15], in[16]); +} + +static void prepare_16x16_odd(const __m256i *in, __m256i *odd) { + odd[0] = _mm256_sub_epi16(in[15], in[16]); + odd[1] = _mm256_sub_epi16(in[14], in[17]); + odd[2] = _mm256_sub_epi16(in[13], in[18]); + odd[3] = _mm256_sub_epi16(in[12], in[19]); + odd[4] = _mm256_sub_epi16(in[11], in[20]); + odd[5] = _mm256_sub_epi16(in[10], in[21]); + odd[6] = _mm256_sub_epi16(in[9], in[22]); + odd[7] = _mm256_sub_epi16(in[8], in[23]); + odd[8] = _mm256_sub_epi16(in[7], in[24]); + odd[9] = _mm256_sub_epi16(in[6], in[25]); + odd[10] = _mm256_sub_epi16(in[5], in[26]); + odd[11] = _mm256_sub_epi16(in[4], in[27]); + odd[12] = _mm256_sub_epi16(in[3], in[28]); + odd[13] = _mm256_sub_epi16(in[2], in[29]); + odd[14] = _mm256_sub_epi16(in[1], in[30]); + odd[15] = _mm256_sub_epi16(in[0], in[31]); +} + +static void collect_16col(const __m256i *even, const __m256i *odd, + __m256i *out) { + // fdct16_avx2() already maps the output + out[0] = even[0]; + out[2] = even[1]; + out[4] = even[2]; + out[6] = even[3]; + out[8] = even[4]; + out[10] = even[5]; + out[12] = even[6]; + out[14] = even[7]; + out[16] = even[8]; + out[18] = even[9]; + out[20] = even[10]; + out[22] = even[11]; + out[24] = even[12]; + out[26] = even[13]; + out[28] = even[14]; + out[30] = even[15]; + + out[1] = odd[0]; + out[17] = odd[1]; + out[9] = odd[2]; + out[25] = odd[3]; + out[5] = odd[4]; + out[21] = odd[5]; + out[13] = odd[6]; + out[29] = odd[7]; + out[3] = odd[8]; + out[19] = odd[9]; + out[11] = odd[10]; + out[27] = odd[11]; + out[7] = odd[12]; + out[23] = odd[13]; + out[15] = odd[14]; + out[31] = odd[15]; +} + +static void collect_coeffs(const __m256i *first_16col_even, + const __m256i *first_16col_odd, + const __m256i *second_16col_even, + const __m256i *second_16col_odd, __m256i *in0, + __m256i *in1) { + collect_16col(first_16col_even, first_16col_odd, in0); + collect_16col(second_16col_even, second_16col_odd, in1); +} + +static void fdct16_odd_avx2(__m256i *in) { + // sequence: cospi_L_H = pairs(L, H) and L first + const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64); + const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64); + const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64); + const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64); + const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64); + const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64); + const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64); + const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64); + const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64); + const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64); + const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + + __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15; + __m256i u0, u1; + + // stage 1 is in prepare_16x16_odd() + + // stage 2 + y0 = in[0]; + y1 = in[1]; + y2 = in[2]; + y3 = in[3]; + + u0 = _mm256_unpacklo_epi16(in[4], in[11]); + u1 = _mm256_unpackhi_epi16(in[4], in[11]); + y4 = butter_fly(u0, u1, cospi_m16_p16); + y11 = butter_fly(u0, u1, cospi_p16_p16); + + u0 = _mm256_unpacklo_epi16(in[5], in[10]); + u1 = _mm256_unpackhi_epi16(in[5], in[10]); + y5 = butter_fly(u0, u1, cospi_m16_p16); + y10 = butter_fly(u0, u1, cospi_p16_p16); + + u0 = _mm256_unpacklo_epi16(in[6], in[9]); + u1 = _mm256_unpackhi_epi16(in[6], in[9]); + y6 = butter_fly(u0, u1, cospi_m16_p16); + y9 = butter_fly(u0, u1, cospi_p16_p16); + + u0 = _mm256_unpacklo_epi16(in[7], in[8]); + u1 = _mm256_unpackhi_epi16(in[7], in[8]); + y7 = butter_fly(u0, u1, cospi_m16_p16); + y8 = butter_fly(u0, u1, cospi_p16_p16); + + y12 = in[12]; + y13 = in[13]; + y14 = in[14]; + y15 = in[15]; + + // stage 3 + x0 = _mm256_add_epi16(y0, y7); + x1 = _mm256_add_epi16(y1, y6); + x2 = _mm256_add_epi16(y2, y5); + x3 = _mm256_add_epi16(y3, y4); + x4 = _mm256_sub_epi16(y3, y4); + x5 = _mm256_sub_epi16(y2, y5); + x6 = _mm256_sub_epi16(y1, y6); + x7 = _mm256_sub_epi16(y0, y7); + x8 = _mm256_sub_epi16(y15, y8); + x9 = _mm256_sub_epi16(y14, y9); + x10 = _mm256_sub_epi16(y13, y10); + x11 = _mm256_sub_epi16(y12, y11); + x12 = _mm256_add_epi16(y12, y11); + x13 = _mm256_add_epi16(y13, y10); + x14 = _mm256_add_epi16(y14, y9); + x15 = _mm256_add_epi16(y15, y8); + + // stage 4 + y0 = x0; + y1 = x1; + y6 = x6; + y7 = x7; + y8 = x8; + y9 = x9; + y14 = x14; + y15 = x15; + + u0 = _mm256_unpacklo_epi16(x2, x13); + u1 = _mm256_unpackhi_epi16(x2, x13); + y2 = butter_fly(u0, u1, cospi_m08_p24); + y13 = butter_fly(u0, u1, cospi_p24_p08); + + u0 = _mm256_unpacklo_epi16(x3, x12); + u1 = _mm256_unpackhi_epi16(x3, x12); + y3 = butter_fly(u0, u1, cospi_m08_p24); + y12 = butter_fly(u0, u1, cospi_p24_p08); + + u0 = _mm256_unpacklo_epi16(x4, x11); + u1 = _mm256_unpackhi_epi16(x4, x11); + y4 = butter_fly(u0, u1, cospi_m24_m08); + y11 = butter_fly(u0, u1, cospi_m08_p24); + + u0 = _mm256_unpacklo_epi16(x5, x10); + u1 = _mm256_unpackhi_epi16(x5, x10); + y5 = butter_fly(u0, u1, cospi_m24_m08); + y10 = butter_fly(u0, u1, cospi_m08_p24); + + // stage 5 + x0 = _mm256_add_epi16(y0, y3); + x1 = _mm256_add_epi16(y1, y2); + x2 = _mm256_sub_epi16(y1, y2); + x3 = _mm256_sub_epi16(y0, y3); + x4 = _mm256_sub_epi16(y7, y4); + x5 = _mm256_sub_epi16(y6, y5); + x6 = _mm256_add_epi16(y6, y5); + x7 = _mm256_add_epi16(y7, y4); + + x8 = _mm256_add_epi16(y8, y11); + x9 = _mm256_add_epi16(y9, y10); + x10 = _mm256_sub_epi16(y9, y10); + x11 = _mm256_sub_epi16(y8, y11); + x12 = _mm256_sub_epi16(y15, y12); + x13 = _mm256_sub_epi16(y14, y13); + x14 = _mm256_add_epi16(y14, y13); + x15 = _mm256_add_epi16(y15, y12); + + // stage 6 + y0 = x0; + y3 = x3; + y4 = x4; + y7 = x7; + y8 = x8; + y11 = x11; + y12 = x12; + y15 = x15; + + u0 = _mm256_unpacklo_epi16(x1, x14); + u1 = _mm256_unpackhi_epi16(x1, x14); + y1 = butter_fly(u0, u1, cospi_m04_p28); + y14 = butter_fly(u0, u1, cospi_p28_p04); + + u0 = _mm256_unpacklo_epi16(x2, x13); + u1 = _mm256_unpackhi_epi16(x2, x13); + y2 = butter_fly(u0, u1, cospi_m28_m04); + y13 = butter_fly(u0, u1, cospi_m04_p28); + + u0 = _mm256_unpacklo_epi16(x5, x10); + u1 = _mm256_unpackhi_epi16(x5, x10); + y5 = butter_fly(u0, u1, cospi_m20_p12); + y10 = butter_fly(u0, u1, cospi_p12_p20); + + u0 = _mm256_unpacklo_epi16(x6, x9); + u1 = _mm256_unpackhi_epi16(x6, x9); + y6 = butter_fly(u0, u1, cospi_m12_m20); + y9 = butter_fly(u0, u1, cospi_m20_p12); + + // stage 7 + x0 = _mm256_add_epi16(y0, y1); + x1 = _mm256_sub_epi16(y0, y1); + x2 = _mm256_sub_epi16(y3, y2); + x3 = _mm256_add_epi16(y3, y2); + x4 = _mm256_add_epi16(y4, y5); + x5 = _mm256_sub_epi16(y4, y5); + x6 = _mm256_sub_epi16(y7, y6); + x7 = _mm256_add_epi16(y7, y6); + + x8 = _mm256_add_epi16(y8, y9); + x9 = _mm256_sub_epi16(y8, y9); + x10 = _mm256_sub_epi16(y11, y10); + x11 = _mm256_add_epi16(y11, y10); + x12 = _mm256_add_epi16(y12, y13); + x13 = _mm256_sub_epi16(y12, y13); + x14 = _mm256_sub_epi16(y15, y14); + x15 = _mm256_add_epi16(y15, y14); + + // stage 8 + u0 = _mm256_unpacklo_epi16(x0, x15); + u1 = _mm256_unpackhi_epi16(x0, x15); + in[0] = butter_fly(u0, u1, cospi_p31_p01); + in[15] = butter_fly(u0, u1, cospi_m01_p31); + + u0 = _mm256_unpacklo_epi16(x1, x14); + u1 = _mm256_unpackhi_epi16(x1, x14); + in[1] = butter_fly(u0, u1, cospi_p15_p17); + in[14] = butter_fly(u0, u1, cospi_m17_p15); + + u0 = _mm256_unpacklo_epi16(x2, x13); + u1 = _mm256_unpackhi_epi16(x2, x13); + in[2] = butter_fly(u0, u1, cospi_p23_p09); + in[13] = butter_fly(u0, u1, cospi_m09_p23); + + u0 = _mm256_unpacklo_epi16(x3, x12); + u1 = _mm256_unpackhi_epi16(x3, x12); + in[3] = butter_fly(u0, u1, cospi_p07_p25); + in[12] = butter_fly(u0, u1, cospi_m25_p07); + + u0 = _mm256_unpacklo_epi16(x4, x11); + u1 = _mm256_unpackhi_epi16(x4, x11); + in[4] = butter_fly(u0, u1, cospi_p27_p05); + in[11] = butter_fly(u0, u1, cospi_m05_p27); + + u0 = _mm256_unpacklo_epi16(x5, x10); + u1 = _mm256_unpackhi_epi16(x5, x10); + in[5] = butter_fly(u0, u1, cospi_p11_p21); + in[10] = butter_fly(u0, u1, cospi_m21_p11); + + u0 = _mm256_unpacklo_epi16(x6, x9); + u1 = _mm256_unpackhi_epi16(x6, x9); + in[6] = butter_fly(u0, u1, cospi_p19_p13); + in[9] = butter_fly(u0, u1, cospi_m13_p19); + + u0 = _mm256_unpacklo_epi16(x7, x8); + u1 = _mm256_unpackhi_epi16(x7, x8); + in[7] = butter_fly(u0, u1, cospi_p03_p29); + in[8] = butter_fly(u0, u1, cospi_m29_p03); +} + +static void fdct32_avx2(__m256i *in0, __m256i *in1) { + __m256i even0[16], even1[16], odd0[16], odd1[16]; + prepare_16x16_even(in0, even0); + fdct16_avx2(even0); + + prepare_16x16_odd(in0, odd0); + fdct16_odd_avx2(odd0); + + prepare_16x16_even(in1, even1); + fdct16_avx2(even1); + + prepare_16x16_odd(in1, odd1); + fdct16_odd_avx2(odd1); + + collect_coeffs(even0, odd0, even1, odd1, in0, in1); + + mm256_transpose_32x32(in0, in1); +} + +static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, + tran_low_t *output) { + int i = 0; + const int stride = 32; + tran_low_t *coeff = output; + while (i < 32) { + storeu_output_avx2(&in0[i], coeff); + storeu_output_avx2(&in1[i], coeff + 16); + coeff += stride; + i += 1; + } +} + +#if CONFIG_EXT_TX +static void fhalfright32_16col_avx2(__m256i *in) { + int i = 0; + const __m256i zero = _mm256_setzero_si256(); + const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2); + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i x0, x1; + + while (i < 16) { + in[i] = _mm256_slli_epi16(in[i], 2); + x0 = _mm256_unpacklo_epi16(in[i + 16], zero); + x1 = _mm256_unpackhi_epi16(in[i + 16], zero); + x0 = _mm256_madd_epi16(x0, sqrt2); + x1 = _mm256_madd_epi16(x1, sqrt2); + x0 = _mm256_add_epi32(x0, dct_rounding); + x1 = _mm256_add_epi32(x1, dct_rounding); + x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS); + x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS); + in[i + 16] = _mm256_packs_epi32(x0, x1); + i += 1; + } + fdct16_avx2(&in[16]); +} + +static void fhalfright32_avx2(__m256i *in0, __m256i *in1) { + fhalfright32_16col_avx2(in0); + fhalfright32_16col_avx2(in1); + mm256_vectors_swap(in0, &in0[16], 16); + mm256_vectors_swap(in1, &in1[16], 16); + mm256_transpose_32x32(in0, in1); +} +#endif // CONFIG_EXT_TX + +static INLINE void load_buffer_32x32(const int16_t *input, int stride, + int flipud, int fliplr, __m256i *in0, + __m256i *in1) { + // Load 4 16x16 blocks + const int16_t *topL = input; + const int16_t *topR = input + 16; + const int16_t *botL = input + 16 * stride; + const int16_t *botR = input + 16 * stride + 16; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 16 columns + load_buffer_16x16(topL, stride, flipud, fliplr, in0); + load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16); + + // load second 16 columns + load_buffer_16x16(topR, stride, flipud, fliplr, in1); + load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16); +} + +static INLINE void right_shift_32x32_16col(int bit, __m256i *in) { + int i = 0; + const __m256i rounding = _mm256_set1_epi16((1 << bit) >> 1); + __m256i sign; + while (i < 32) { + sign = _mm256_srai_epi16(in[i], 15); + in[i] = _mm256_add_epi16(in[i], rounding); + in[i] = _mm256_add_epi16(in[i], sign); + in[i] = _mm256_srai_epi16(in[i], bit); + i += 1; + } +} + +// Positive rounding +static INLINE void right_shift_32x32(__m256i *in0, __m256i *in1) { + const int bit = 4; + right_shift_32x32_16col(bit, in0); + right_shift_32x32_16col(bit, in1); +} + +#if CONFIG_EXT_TX +static void fidtx32_avx2(__m256i *in0, __m256i *in1) { + int i = 0; + while (i < 32) { + in0[i] = _mm256_slli_epi16(in0[i], 2); + in1[i] = _mm256_slli_epi16(in1[i], 2); + i += 1; + } + mm256_transpose_32x32(in0, in1); +} +#endif + +void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m256i in0[32]; // left 32 columns + __m256i in1[32]; // right 32 columns + + switch (tx_type) { + case DCT_DCT: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fdct32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fdct32_avx2(in0, in1); + break; +#if CONFIG_EXT_TX + case ADST_DCT: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fdct32_avx2(in0, in1); + break; + case DCT_ADST: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fdct32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case ADST_ADST: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case FLIPADST_DCT: + load_buffer_32x32(input, stride, 1, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fdct32_avx2(in0, in1); + break; + case DCT_FLIPADST: + load_buffer_32x32(input, stride, 0, 1, in0, in1); + fdct32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case FLIPADST_FLIPADST: + load_buffer_32x32(input, stride, 1, 1, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case ADST_FLIPADST: + load_buffer_32x32(input, stride, 0, 1, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case FLIPADST_ADST: + load_buffer_32x32(input, stride, 1, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case IDTX: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fidtx32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fidtx32_avx2(in0, in1); + break; + case V_DCT: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fdct32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fidtx32_avx2(in0, in1); + break; + case H_DCT: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fidtx32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fdct32_avx2(in0, in1); + break; + case V_ADST: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fidtx32_avx2(in0, in1); + break; + case H_ADST: + load_buffer_32x32(input, stride, 0, 0, in0, in1); + fidtx32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; + case V_FLIPADST: + load_buffer_32x32(input, stride, 1, 0, in0, in1); + fhalfright32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fidtx32_avx2(in0, in1); + break; + case H_FLIPADST: + load_buffer_32x32(input, stride, 0, 1, in0, in1); + fidtx32_avx2(in0, in1); + right_shift_32x32(in0, in1); + fhalfright32_avx2(in0, in1); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + write_buffer_32x32(in0, in1, output); + _mm256_zeroupper(); +} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm new file mode 100644 index 000000000..7186b6b92 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm @@ -0,0 +1,215 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +; void av1_temporal_filter_apply_sse2 | arg +; (unsigned char *frame1, | 0 +; unsigned int stride, | 1 +; unsigned char *frame2, | 2 +; unsigned int block_width, | 3 +; unsigned int block_height, | 4 +; int strength, | 5 +; int filter_weight, | 6 +; unsigned int *accumulator, | 7 +; unsigned short *count) | 8 +global sym(av1_temporal_filter_apply_sse2) PRIVATE +sym(av1_temporal_filter_apply_sse2): + + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ALIGN_STACK 16, rax + %define block_width 0 + %define block_height 16 + %define strength 32 + %define filter_weight 48 + %define rounding_bit 64 + %define rbp_backup 80 + %define stack_size 96 + sub rsp, stack_size + mov [rsp + rbp_backup], rbp + ; end prolog + + mov edx, arg(3) + mov [rsp + block_width], rdx + mov edx, arg(4) + mov [rsp + block_height], rdx + movd xmm6, arg(5) + movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read + + ; calculate the rounding bit outside the loop + ; 0x8000 >> (16 - strength) + mov rdx, 16 + sub rdx, arg(5) ; 16 - strength + movq xmm4, rdx ; can't use rdx w/ shift + movdqa xmm5, [GLOBAL(_const_top_bit)] + psrlw xmm5, xmm4 + movdqa [rsp + rounding_bit], xmm5 + + mov rsi, arg(0) ; src/frame1 + mov rdx, arg(2) ; predictor frame + mov rdi, arg(7) ; accumulator + mov rax, arg(8) ; count + + ; dup the filter weight and store for later + movd xmm0, arg(6) ; filter_weight + pshuflw xmm0, xmm0, 0 + punpcklwd xmm0, xmm0 + movdqa [rsp + filter_weight], xmm0 + + mov rbp, arg(1) ; stride + pxor xmm7, xmm7 ; zero for extraction + + mov rcx, [rsp + block_width] + imul rcx, [rsp + block_height] + add rcx, rdx + cmp dword ptr [rsp + block_width], 8 + jne .temporal_filter_apply_load_16 + +.temporal_filter_apply_load_8: + movq xmm0, [rsi] ; first row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm0, xmm7 ; src[ 0- 7] + movq xmm1, [rsi] ; second row + lea rsi, [rsi + rbp] ; += stride + punpcklbw xmm1, xmm7 ; src[ 8-15] + jmp .temporal_filter_apply_load_finished + +.temporal_filter_apply_load_16: + movdqa xmm0, [rsi] ; src (frame1) + lea rsi, [rsi + rbp] ; += stride + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; src[ 0- 7] + punpckhbw xmm1, xmm7 ; src[ 8-15] + +.temporal_filter_apply_load_finished: + movdqa xmm2, [rdx] ; predictor (frame2) + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm7 ; pred[ 0- 7] + punpckhbw xmm3, xmm7 ; pred[ 8-15] + + ; modifier = src_byte - pixel_value + psubw xmm0, xmm2 ; src - pred[ 0- 7] + psubw xmm1, xmm3 ; src - pred[ 8-15] + + ; modifier *= modifier + pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 + pmullw xmm1, xmm1 ; modifer[ 8-15]^2 + + ; modifier *= 3 + pmullw xmm0, [GLOBAL(_const_3w)] + pmullw xmm1, [GLOBAL(_const_3w)] + + ; modifer += 0x8000 >> (16 - strength) + paddw xmm0, [rsp + rounding_bit] + paddw xmm1, [rsp + rounding_bit] + + ; modifier >>= strength + psrlw xmm0, [rsp + strength] + psrlw xmm1, [rsp + strength] + + ; modifier = 16 - modifier + ; saturation takes care of modifier > 16 + movdqa xmm3, [GLOBAL(_const_16w)] + movdqa xmm2, [GLOBAL(_const_16w)] + psubusw xmm3, xmm1 + psubusw xmm2, xmm0 + + ; modifier *= filter_weight + pmullw xmm2, [rsp + filter_weight] + pmullw xmm3, [rsp + filter_weight] + + ; count + movdqa xmm4, [rax] + movdqa xmm5, [rax+16] + ; += modifier + paddw xmm4, xmm2 + paddw xmm5, xmm3 + ; write back + movdqa [rax], xmm4 + movdqa [rax+16], xmm5 + lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) + + ; load and extract the predictor up to shorts + pxor xmm7, xmm7 + movdqa xmm0, [rdx] + lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm7 ; pred[ 0- 7] + punpckhbw xmm1, xmm7 ; pred[ 8-15] + + ; modifier *= pixel_value + pmullw xmm0, xmm2 + pmullw xmm1, xmm3 + + ; expand to double words + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm7 ; [ 0- 3] + punpckhwd xmm2, xmm7 ; [ 4- 7] + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm7 ; [ 8-11] + punpckhwd xmm3, xmm7 ; [12-15] + + ; accumulator + movdqa xmm4, [rdi] + movdqa xmm5, [rdi+16] + movdqa xmm6, [rdi+32] + movdqa xmm7, [rdi+48] + ; += modifier + paddd xmm4, xmm0 + paddd xmm5, xmm2 + paddd xmm6, xmm1 + paddd xmm7, xmm3 + ; write back + movdqa [rdi], xmm4 + movdqa [rdi+16], xmm5 + movdqa [rdi+32], xmm6 + movdqa [rdi+48], xmm7 + lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) + + cmp rdx, rcx + je .temporal_filter_apply_epilog + pxor xmm7, xmm7 ; zero for extraction + cmp dword ptr [rsp + block_width], 16 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 + +.temporal_filter_apply_epilog: + ; begin epilog + mov rbp, [rsp + rbp_backup] + add rsp, stack_size + pop rsp + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +_const_3w: + times 8 dw 3 +align 16 +_const_top_bit: + times 8 dw 1<<15 +align 16 +_const_16w: + times 8 dw 16 diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c new file mode 100644 index 000000000..bf233ca4d --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + int n8 = n + 8; + + uint64_t csse; + + const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); + const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + + __m128i v_acc0_q = _mm_setzero_si128(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m128i v_r0_w = xx_load_128(r1 + n); + const __m128i v_r1_w = xx_load_128(r1 + n8); + const __m128i v_d0_w = xx_load_128(d + n); + const __m128i v_d1_w = xx_load_128(d + n8); + const __m128i v_m01_b = xx_load_128(m + n); + + const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); + const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); + const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); + const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + + const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); + const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); + + const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); + const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); + const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); + const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); + + const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); + const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); + + const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); + const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); + + const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), + _mm_srli_epi64(v_sq0_d, 32)); + const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), + _mm_srli_epi64(v_sq1_d, 32)); + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); + + n8 += 16; + n += 16; + } while (n); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + +#if ARCH_X86_64 + csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); +#else + xx_storel_64(&csse, v_acc0_q); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + + __m128i v_sign_d; + __m128i v_acc0_d = _mm_setzero_si128(); + __m128i v_acc1_d = _mm_setzero_si128(); + __m128i v_acc_q; + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input. + assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m128i v_m01_b = xx_load_128(m); + const __m128i v_m23_b = xx_load_128(m + 16); + const __m128i v_m45_b = xx_load_128(m + 32); + const __m128i v_m67_b = xx_load_128(m + 48); + + const __m128i v_d0_w = xx_load_128(ds); + const __m128i v_d1_w = xx_load_128(ds + 8); + const __m128i v_d2_w = xx_load_128(ds + 16); + const __m128i v_d3_w = xx_load_128(ds + 24); + const __m128i v_d4_w = xx_load_128(ds + 32); + const __m128i v_d5_w = xx_load_128(ds + 40); + const __m128i v_d6_w = xx_load_128(ds + 48); + const __m128i v_d7_w = xx_load_128(ds + 56); + + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); + const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); + + const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); + const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); + const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); + const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); + const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); + const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); + + const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); + const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); + const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); + const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); + + const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); + const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); + + v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); + v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); + v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); + + v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); + v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), + _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); + + v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + xx_storel_64(&acc, v_acc_q); +#endif + + return acc > limit; +} + +// Negate under mask +static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { + return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m128i v_neg_w = + _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0); + + assert(N % 64 == 0); + + do { + const __m128i v_a0_w = xx_load_128(a); + const __m128i v_b0_w = xx_load_128(b); + const __m128i v_a1_w = xx_load_128(a + 8); + const __m128i v_b1_w = xx_load_128(b + 8); + const __m128i v_a2_w = xx_load_128(a + 16); + const __m128i v_b2_w = xx_load_128(b + 16); + const __m128i v_a3_w = xx_load_128(a + 24); + const __m128i v_b3_w = xx_load_128(b + 24); + + const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); + const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); + const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); + const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); + const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); + const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); + const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); + const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); + const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); + const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); + const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); + const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); + const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); + const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); + const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); + + const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); + const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); + const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); + const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); + + xx_store_128(d, v_r0_w); + xx_store_128(d + 8, v_r1_w); + xx_store_128(d + 16, v_r2_w); + xx_store_128(d + 24, v_r3_w); + + a += 32; + b += 32; + d += 32; + N -= 32; + } while (N); +} -- cgit v1.2.3