| field | value | date |
|-------|-------|------|
| author | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:00 -0500 |
| committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:03 -0500 |
| commit | d2499ead93dc4298c0882fe98902acb1b5209f99 (patch) | |
| tree | cb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/av1/encoder/rdopt.c | |
| parent | 41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff) | |
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/av1/encoder/rdopt.c')

| mode | path | lines |
|------|------|-------|
| -rw-r--r-- | third_party/aom/av1/encoder/rdopt.c | 4551 |

1 file changed, 2851 insertions, 1700 deletions
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index fef6d2875..c2d15534f 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -55,17 +55,97 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/tx_prune_model_weights.h" -#define DNN_BASED_RD_INTERP_FILTER 0 +typedef void (*model_rd_for_sb_type)( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -// Set this macro as 1 to collect data about tx size selection. -#define COLLECT_TX_SIZE_DATA 0 +static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist); +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_dnn( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); +static void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist); +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); -#if COLLECT_TX_SIZE_DATA -static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; -#endif +typedef 
enum { + MODELRD_LEGACY, + MODELRD_CURVFIT, + MODELRD_SUFFIT, + MODELRD_DNN, + MODELRD_FULLRDY, + MODELRD_TYPES +} ModelRdType; + +static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit, + model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy +}; + +static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit, + model_rd_with_dnn, NULL +}; + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 2 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_JNT_COMPOUND 1 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { @@ -103,6 +183,16 @@ typedef enum { FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 } FAST_TX_SEARCH_MODE; +static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t ref_best_rd); + +static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode); + struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; @@ -112,6 +202,7 @@ struct rdcost_block_args { int64_t this_rd; int64_t best_rd; int exit_early; + int incomplete_exit; int use_fast_coef_costing; FAST_TX_SEARCH_MODE ftxs_mode; }; @@ -126,8 +217,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEWMV, { LAST_FRAME, NONE_FRAME } }, { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, @@ -172,12 +261,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, - { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, - - { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -274,15 +357,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE_FRAME } }, - { V_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, - { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, - { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, @@ -314,6 +388,21 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME 
} }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, }; static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = { @@ -451,7 +540,6 @@ static int get_prediction_mode_idx(PREDICTION_MODE this_mode, if (this_mode >= SINGLE_INTER_MODE_START && this_mode < SINGLE_INTER_MODE_END) { assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); - assert(second_ref_frame == NONE_FRAME); return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] [ref_frame]; } @@ -479,6 +567,12 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_D113_PRED, UV_D45_PRED, }; +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + typedef struct InterModeSearchState { int64_t best_rd; MB_MODE_INFO best_mbmode; @@ -510,32 +604,21 @@ typedef struct InterModeSearchState { int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES]; int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES]; - int64_t modelled_rd[MB_MODE_COUNT][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; } InterModeSearchState; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - -typedef struct InterModeRdModel { - int ready; - double a; - double b; - double dist_mean; - int skip_count; - int non_skip_count; - int fp_skip_count; - int bracket_idx; -} InterModeRdModel; - -InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; - -#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 -static int inter_mode_data_idx[4]; -static int64_t inter_mode_data_sse[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_dist[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_residue_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int inter_mode_data_all_cost[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; -static int64_t inter_mode_data_ref_best_rd[4][INTER_MODE_RD_DATA_OVERALL_SIZE]; - int inter_mode_data_block_idx(BLOCK_SIZE bsize) { if (bsize == BLOCK_8X8) return 1; if (bsize == BLOCK_16X16) return 2; @@ -543,137 +626,152 @@ int inter_mode_data_block_idx(BLOCK_SIZE bsize) { return -1; } -void av1_inter_mode_data_init() { +void av1_inter_mode_data_init(TileDataEnc *tile_data) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; md->ready = 0; - md->skip_count = 0; - md->non_skip_count = 0; - 
md->fp_skip_count = 0; - md->bracket_idx = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } } -void av1_inter_mode_data_show(const AV1_COMMON *cm) { - printf("frame_offset %d\n", cm->frame_offset); - for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { - const int block_idx = inter_mode_data_block_idx(i); - if (block_idx != -1) inter_mode_data_idx[block_idx] = 0; - InterModeRdModel *md = &inter_mode_rd_models[i]; - if (md->ready) { - printf("bsize %d non_skip_count %d skip_count %d fp_skip_count %d\n", i, - md->non_skip_count, md->skip_count, md->fp_skip_count); +static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + const double est_ld = md->a * sse + md->b; + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld); + *est_dist = (int64_t)round(md->dist_mean); } + return 1; } + return 0; } -static int64_t get_est_rd(BLOCK_SIZE bsize, int rdmult, int64_t sse, - int curr_cost) { - aom_clear_system_state(); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; - if (md->ready) { - const double est_ld = md->a * sse + md->b; - const double est_residue_cost = (sse - md->dist_mean) / est_ld; - const int64_t est_cost = (int64_t)round(est_residue_cost) + curr_cost; - const int64_t int64_dist_mean = (int64_t)round(md->dist_mean); - const int64_t est_rd = RDCOST(rdmult, est_cost, int64_dist_mean); +static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult, + int64_t sse, int curr_cost) { + int est_residue_cost; + int64_t est_dist; + if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) { + int rate = est_residue_cost + curr_cost; + int64_t est_rd = RDCOST(rdmult, rate, est_dist); return est_rd; } return 0; } -#define DATA_BRACKETS 7 -static const int data_num_threshold[DATA_BRACKETS] = { - 200, 400, 800, 1600, 3200, 6400, INT32_MAX -}; - -void av1_inter_mode_data_fit(int rdmult) { +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { aom_clear_system_state(); for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { const int block_idx = inter_mode_data_block_idx(bsize); - InterModeRdModel *md = &inter_mode_rd_models[bsize]; + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (block_idx == -1) continue; - int data_num = inter_mode_data_idx[block_idx]; - if (data_num < data_num_threshold[md->bracket_idx]) { + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = 
md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; } - double my = 0; - double mx = 0; - double dx = 0; - double dxy = 0; - double dist_mean = 0; - const int train_num = data_num; - for (int i = 0; i < train_num; ++i) { - const double sse = (double)inter_mode_data_sse[block_idx][i]; - const double dist = (double)inter_mode_data_dist[block_idx][i]; - const double residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const double ld = (sse - dist) / residue_cost; - dist_mean += dist; - my += ld; - mx += sse; - dx += sse * sse; - dxy += sse * ld; - } - dist_mean = dist_mean / data_num; - my = my / train_num; - mx = mx / train_num; - dx = sqrt(dx / train_num); - dxy = dxy / train_num; - - md->dist_mean = dist_mean; - md->a = (dxy - mx * my) / (dx * dx - mx * mx); - md->b = my - md->a * mx; - ++md->bracket_idx; - md->ready = 1; - assert(md->bracket_idx < DATA_BRACKETS); - (void)rdmult; -#if 0 - int skip_count = 0; - int fp_skip_count = 0; - double avg_error = 0; - const int test_num = data_num; - for (int i = 0; i < data_num; ++i) { - const int64_t sse = inter_mode_data_sse[block_idx][i]; - const int64_t dist = inter_mode_data_dist[block_idx][i]; - const int64_t residue_cost = inter_mode_data_residue_cost[block_idx][i]; - const int64_t all_cost = inter_mode_data_all_cost[block_idx][i]; - const int64_t est_rd = - get_est_rd(bsize, rdmult, sse, all_cost - residue_cost); - const int64_t real_rd = RDCOST(rdmult, all_cost, dist); - const int64_t ref_best_rd = inter_mode_data_ref_best_rd[block_idx][i]; - if (est_rd > ref_best_rd) { - ++skip_count; - if (real_rd < ref_best_rd) { - ++fp_skip_count; - } - } - avg_error += abs(est_rd - real_rd) * 100. / real_rd; - } - avg_error /= test_num; - printf("test_num %d bsize %d avg_error %f skip_count %d fp_skip_count %d\n", - test_num, bsize, avg_error, skip_count, fp_skip_count); -#endif } } -static void inter_mode_data_push(BLOCK_SIZE bsize, int64_t sse, int64_t dist, - int residue_cost, int all_cost, - int64_t ref_best_rd) { +static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int64_t dist, int residue_cost) { if (residue_cost == 0 || sse == dist) return; const int block_idx = inter_mode_data_block_idx(bsize); if (block_idx == -1) return; - if (inter_mode_data_idx[block_idx] < INTER_MODE_RD_DATA_OVERALL_SIZE) { - const int data_idx = inter_mode_data_idx[block_idx]; - inter_mode_data_sse[block_idx][data_idx] = sse; - inter_mode_data_dist[block_idx][data_idx] = dist; - inter_mode_data_residue_cost[block_idx][data_idx] = residue_cost; - inter_mode_data_all_cost[block_idx][data_idx] = all_cost; - inter_mode_data_ref_best_rd[block_idx][data_idx] = ref_best_rd; - ++inter_mode_data_idx[block_idx]; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + aom_clear_system_state(); + const double ld = (sse - dist) * 1. 
/ residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += sse * sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, int64_t est_rd, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = est_rd; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + return 0; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static void inter_modes_info_sort(const InterModesInfo *inter_modes_info, + RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS @@ -1528,13 +1626,13 @@ static void score_2D_transform_pow8(float *scores_2D, float shift) { // will lead to pruning i+1 TX types on average static const float *prune_2D_adaptive_thresholds[] = { // TX_4X4 - (float[]){ 0.02014f, 0.02722f, 0.03430f, 0.04114f, 0.04724f, 0.05212f, - 0.05627f, 0.06018f, 0.06409f, 0.06824f, 0.07312f, 0.07849f, - 0.08606f, 0.09827f }, + (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, + 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, + 0.09778f, 0.11780f }, // TX_8X8 - (float[]){ 0.00745f, 0.01355f, 0.02039f, 0.02795f, 0.03625f, 0.04407f, - 0.05042f, 0.05579f, 0.06067f, 0.06604f, 0.07239f, 0.08093f, - 0.09363f, 0.11682f }, + (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, + 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, + 0.10803f, 0.14124f }, // TX_16X16 (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, 0.06897f, 0.07629f, 0.08875f, 0.11169f }, @@ -1543,35 +1641,37 @@ static const float *prune_2D_adaptive_thresholds[] = { // TX_64X64 NULL, // TX_4X8 - (float[]){ 0.01282f, 0.02087f, 0.02844f, 0.03601f, 0.04285f, 0.04871f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09119f, 0.10828f }, + (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, + 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, + 0.10168f, 0.12585f }, // TX_8X4 - (float[]){ 0.01184f, 0.01941f, 0.02722f, 0.03503f, 0.04187f, 0.04822f, - 0.05359f, 0.05823f, 0.06287f, 0.06799f, 0.07361f, 0.08093f, - 0.09167f, 0.10974f }, + (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, + 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, + 0.10583f, 0.13123f }, // TX_8X16 - (float[]){ 0.00525f, 0.01135f, 0.01819f, 0.02576f, 0.03357f, 0.04114f, - 0.04773f, 0.05383f, 0.05920f, 0.06506f, 0.07190f, 0.08118f, - 0.09509f, 0.12097f }, + (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, + 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, + 0.10730f, 0.14221f }, // TX_16X8 - (float[]){ 0.00525f, 0.01160f, 0.01819f, 0.02527f, 0.03308f, 0.04065f, - 0.04773f, 0.05383f, 0.05969f, 0.06531f, 0.07214f, 0.08118f, - 0.09485f, 0.12048f 
}, + (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, + 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, + 0.10339f, 0.13464f }, // TX_16X32 - (float[]){ 0.01257f, 0.02576f, 0.03723f, 0.04578f, 0.05212f, 0.05798f, - 0.06506f, 0.07385f, 0.08606f, 0.10925f }, + NULL, // TX_32X16 - (float[]){ 0.01233f, 0.02527f, 0.03699f, 0.04602f, 0.05286f, 0.05896f, - 0.06531f, 0.07336f, 0.08582f, 0.11072f }, + NULL, // TX_32X64 NULL, // TX_64X32 NULL, // TX_4X16 - NULL, + (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, + 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, + 0.10242f, 0.12878f }, // TX_16X4 - NULL, + (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, + 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, + 0.10217f, 0.12610f }, // TX_8X32 NULL, // TX_32X8 @@ -1631,7 +1731,18 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, cur_scores_2D[3]; } score_2D_average /= 16; - score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + + const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } }; + int pruning_aggressiveness = 1; + if (tx_set_type == EXT_TX_SET_ALL16) { + score_2D_transform_pow8(scores_2D, (10 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0]; + } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) { + score_2D_transform_pow8(scores_2D, (20 - score_2D_average)); + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1]; + } // Always keep the TX type with the highest score, prune all others with // score below score_thresh. @@ -1645,18 +1756,6 @@ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, } } - int pruning_aggressiveness = 0; - if (prune_mode == PRUNE_2D_ACCURATE) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 6; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 4; - } else if (prune_mode == PRUNE_2D_FAST) { - if (tx_set_type == EXT_TX_SET_ALL16) - pruning_aggressiveness = 10; - else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) - pruning_aggressiveness = 7; - } const float score_thresh = prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1]; @@ -1724,9 +1823,11 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } static void model_rd_from_sse(const AV1_COMP *const cpi, - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, - int plane, int64_t sse, int *rate, - int64_t *dist) { + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)num_samples; + const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dequant_shift = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; @@ -1734,15 +1835,17 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, // Fast approximate the modelling function. 
if (cpi->sf.simple_model_rd_from_var) { const int64_t square_error = sse; - int quantizer = (pd->dequant_Q3[1] >> dequant_shift); + int quantizer = pd->dequant_Q3[1] >> dequant_shift; if (quantizer < 120) - *rate = (int)((square_error * (280 - quantizer)) >> - (16 - AV1_PROB_COST_SHIFT)); + *rate = (int)AOMMIN( + (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), + INT_MAX); else *rate = 0; + assert(*rate >= 0); *dist = (square_error * quantizer) >> 8; } else { - av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize], + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], pd->dequant_Q3[1] >> dequant_shift, rate, dist); } @@ -1776,22 +1879,23 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { + int plane_to, int mi_row, int mi_col, + int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, + int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. int plane; + (void)mi_row; + (void)mi_col; const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; - for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -1805,26 +1909,31 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, if (x->skip_chroma_rd && plane) continue; - // TODO(geza): Write direct sse functions that do not compute - // variance as well. 
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; - - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &rate, &dist); - rate_sum += rate; dist_sum += dist; if (plane_rate) plane_rate[plane] = rate; if (plane_sse) plane_sse[plane] = sse; if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); } if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } @@ -1949,7 +2058,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, assert(visible_cols > 0); #if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8) + if (x->using_dist_8x8 && plane == 0) return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, tx_bsize, txb_cols, txb_rows, visible_cols, visible_rows, x->qindex); @@ -1967,8 +2076,7 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize, - int force_sse) { + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, @@ -1978,8 +2086,7 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; - if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 && - txb_height >= 8) { + if (x->using_dist_8x8 && plane == 0) { const int src_stride = x->plane[plane].src.stride; const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; @@ -2145,29 +2252,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, MAX_TX_SIZE, eob, cpi->common.reduced_tx_set_used); -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. 
- const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = - CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; - } else { - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; - } - } -#endif // CONFIG_DIST_8X8 return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, blk_row, blk_col, plane_bsize, tx_bsize); } @@ -2258,11 +2343,11 @@ static void get_2x2_normalized_sses_and_sads( } } +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS - // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values - // 0: Do not collect any RD stats - // 1: Collect RD stats for transform units - // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS == 1 static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -2274,7 +2359,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. static unsigned int seed = 21743; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "tu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2336,7 +2421,8 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, tx_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); @@ -2360,7 +2446,7 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_COLLECT_RD_STATS == 1 -#if CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_RD_STATS >= 2 static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, BLOCK_SIZE plane_bsize) { @@ -2369,7 +2455,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, // Generate small sample to restrict output size. 
static unsigned int seed = 95014; - if (lcg_rand16(&seed) % 100 > 0) return; + if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "pu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2390,8 +2476,10 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; - fprintf(fout, "%g %g", rate_norm, dist_norm); + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; @@ -2426,14 +2514,18 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g", sad_norm_arr[i]); } - fprintf(fout, " %d %d %d", q_step, bw, bh); + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, &model_dist); + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; - fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); double mean = get_mean(src_diff, diff_stride, bw, bh); mean /= (1 << shift); @@ -2450,53 +2542,51 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, "\n"); fclose(fout); } -#endif // CONFIG_COLLECT_RD_STATS == 2 +#endif // CONFIG_COLLECT_RD_STATS >= 2 #endif // CONFIG_COLLECT_RD_STATS -static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE plane_bsize, int plane, int64_t *rsse, +static void model_rd_with_dnn(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, + int plane, int64_t sse, int num_samples, int *rate, int64_t *dist) { const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int log_numpels = num_pels_log2_lookup[plane_bsize]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + const struct macroblock_plane *const p = &x->plane[plane]; int bw, bh; - const int diff_stride = block_size_wide[plane_bsize]; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - const int num_samples = bw * bh; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; - const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; const int16_t *const src_diff = p->src_diff; + const int diff_stride = block_size_wide[plane_bsize]; const int shift = (xd->bd - 8); - int64_t sse = aom_sum_squares_2d_i16(p->src_diff, diff_stride, bw, bh); - sse = ROUND_POWER_OF_TWO(sse, shift * 2); - const double sse_norm = (double)sse / num_samples; if (sse == 0) { if (rate) *rate = 0; if (dist) *dist = 0; - if (rsse) *rsse = sse; return; } if (plane) { int model_rate; int64_t model_dist; - model_rd_from_sse(cpi, xd, plane_bsize, plane, sse, &model_rate, - &model_dist); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); if (rate) *rate = model_rate; if (dist) *dist = model_dist; - if (rsse) *rsse = sse; return; } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + double sse_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, @@ -2506,25 +2596,26 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); mean /= (1 << shift); } - const double variance = sse_norm - mean * mean; - assert(variance >= 0.0); + double sse_norm_sum = 0.0, sse_frac_arr[3]; + for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k]; + for (int k = 0; k < 3; ++k) + sse_frac_arr[k] = + sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25; const double q_sqr = (double)(q_step * q_step); const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0); + const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0); double hor_corr, vert_corr; get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); - float features[11]; + float features[NUM_FEATURES_PUSTATS]; features[0] = (float)hor_corr; features[1] = (float)log_numpels; - features[2] = (float)q_sqr; + features[2] = (float)mean_sqr_by_sse_norm; features[3] = (float)q_sqr_by_sse_norm; - features[4] = (float)sse_norm_arr[0]; - features[5] = (float)sse_norm_arr[1]; - features[6] = (float)sse_norm_arr[2]; - features[7] = (float)sse_norm_arr[3]; - features[8] = (float)sse_norm; - features[9] = (float)variance; - features[10] = (float)vert_corr; + features[4] = (float)sse_frac_arr[0]; + features[5] = (float)sse_frac_arr[1]; + features[6] = (float)sse_frac_arr[2]; + features[7] = (float)vert_corr; float rate_f, dist_by_sse_norm_f; av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f); @@ -2532,27 +2623,29 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, MACROBLOCK *const x, const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm)); int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); // Check if skip is better - if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, (sse << 4))) { + if (rate_i == 0) { dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { rate_i = 0; - } else if (rate_i == 0) { dist_i = sse << 4; } if (rate) *rate = rate_i; if (dist) *dist = dist_i; - if (rsse) *rsse = sse; return; } -void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - 
int plane_to, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb, int *plane_rate, - int64_t *plane_sse, int64_t *plane_dist) { +static void model_rd_for_sb_with_dnn( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -2562,19 +2655,306 @@ void model_rd_for_sb_with_dnn(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; int64_t total_sse = 0; - x->pred_sse[ref] = 0; + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +// Fits a surface for rate and distortion using as features: +// log2(sse_norm + 1) and log2(sse_norm/qstep^2) +static void model_rd_with_surffit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xm = log(sse_norm + 1.0) / log(2.0); + const double yl = log(sse_norm / qstepsqr) / log(2.0); + double rate_f, dist_by_sse_norm_f; + + av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static void model_rd_for_sb_with_surffit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + + model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +// Fits a curve for rate and distortion using as feature: +// log2(sse_norm/qstep^2) +static void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; + const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); + + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + aom_clear_system_state(); + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xqr = log(sse_norm / qstepsqr) / log(2.0); + + double rate_f, dist_by_sse_norm_f; + av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + aom_clear_system_state(); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + (void)mi_row; + (void)mi_col; + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + + if (x->skip_chroma_rd && plane) continue; + + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + const int shift = (xd->bd - 8); + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE 
plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; int64_t sse; int rate; int64_t dist; if (x->skip_chroma_rd && plane) continue; - model_rd_with_dnn(cpi, x, plane_bsize, plane, &sse, &rate, &dist); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + + RD_STATS rd_stats; + if (plane == 0) { + select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); + if (rd_stats.invalid_rate) { + rate = 0; + dist = sse << 4; + } else { + rate = rd_stats.rate; + dist = rd_stats.dist; + } + } else { + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + } if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); @@ -2638,8 +3018,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; - if (intra_hash_idx > 0 && - intra_txb_rd_info->entropy_context == cur_joint_ctx && + if (intra_txb_rd_info->entropy_context == cur_joint_ctx && x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type; const TX_TYPE ref_tx_type = @@ -2727,7 +3106,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_DIST_8X8 if (x->using_dist_8x8) use_transform_domain_distortion = 0; #endif - int calc_pixel_domain_distortion_final = cpi->sf.use_transform_domain_distortion == 1 && use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD && @@ -2740,7 +3118,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; int64_t block_sse = - pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1); + pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); block_sse *= 16; @@ -2834,7 +3212,6 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(best_rd != INT64_MAX); best_rd_stats->skip = best_eob == 0; - if (best_eob == 0) best_tx_type = DCT_DCT; if (plane == 0) { update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, best_tx_type); @@ -2914,24 +3291,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t rd1, rd2, rd; RD_STATS this_rd_stats; -#if CONFIG_DIST_8X8 - // If sub8x8 tx, 8x8 or larger partition, and luma channel, - // dist-8x8 disables early skip, because the distortion metrics for - // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition - // (new distortion metric) are different. - // Exception is: dist-8x8 is enabled but still MSE is used, - // i.e. "--tune=" encoder option is not used. 
- int bw = block_size_wide[plane_bsize]; - int bh = block_size_high[plane_bsize]; - int disable_early_skip = - x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && - x->tune_metric != AOM_TUNE_PSNR; -#endif // CONFIG_DIST_8X8 - av1_init_rd_stats(&this_rd_stats); - if (args->exit_early) return; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } if (!is_inter_block(mbmi)) { av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); @@ -2954,11 +3319,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); - if (plane == 0) { - x->blk_skip[blk_row * - (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + - blk_col] = (x->plane[plane].eobs[block] == 0); - } + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) + + blk_col; + + if (plane == 0) + set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0); + else + set_blk_skip(x, plane, blk_idx, 0); rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); @@ -2972,100 +3340,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, args->this_rd += rd; -#if CONFIG_DIST_8X8 - if (!disable_early_skip) -#endif - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } -} - -#if CONFIG_DIST_8X8 -static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, - struct rdcost_block_args *args) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[0]; - const struct macroblock_plane *const p = &x->plane[0]; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const uint8_t *src = &p->src.buf[0]; - const uint8_t *dst = &pd->dst.buf[0]; - const int16_t *pred = &x->pred_luma[0]; - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - int visible_w = bw; - int visible_h = bh; - - int i, j; - int64_t rd, rd1, rd2; - int64_t sse = INT64_MAX, dist = INT64_MAX; - int qindex = x->qindex; - - assert((bw & 0x07) == 0); - assert((bh & 0x07) == 0); - - get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w, - &visible_h); - - const int diff_stride = block_size_wide[bsize]; - const int16_t *diff = p->src_diff; - sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w, - visible_h, qindex); - sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - sse *= 16; - - if (!is_inter_block(mbmi)) { - dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } else { - // For inter mode, the decoded pixels are provided in x->pred_luma, - // while the predicted pixels are in dst. 
- uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred16); - else - pred8 = (uint8_t *)pred16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) - CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; - } else { - for (j = 0; j < bh; j++) - for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i]; - } - - dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh, - visible_w, visible_h, qindex); - dist *= 16; - } - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) { - assert(args->rd_stats.sse == sse); - assert(args->rd_stats.dist == dist); + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; } -#endif // DEBUG_DIST_8X8 - - args->rd_stats.sse = sse; - args->rd_stats.dist = dist; - - rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); - rd = AOMMIN(rd1, rd2); - - args->rd_stats.rdcost = rd; - args->this_rd = rd; - - if (args->this_rd > args->best_rd) args->exit_early = 1; } -#endif // CONFIG_DIST_8X8 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, @@ -3089,16 +3368,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, &args); -#if CONFIG_DIST_8X8 - int bw = block_size_wide[bsize]; - int bh = block_size_high[bsize]; - if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 && - bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); -#endif + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early; - if (args.exit_early) { + if (invalid_rd) { av1_invalid_rd_stats(rd_stats); } else { *rd_stats = args.rd_stats; @@ -3269,6 +3544,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) { + if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue; + } +#endif RD_STATS this_rd_stats; if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); @@ -3284,10 +3564,13 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, } if (n == TX_4X4) break; } - mbmi->tx_size = best_tx_size; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + } // Reset the pruning flags. av1_zero(x->tx_search_prune); @@ -3429,7 +3712,8 @@ static int conditional_skipintra(PREDICTION_MODE mode, // Model based RD estimation for luma intra blocks. 
static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, int mode_cost) { + BLOCK_SIZE bsize, int mode_cost, int mi_row, + int mi_col) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3450,10 +3734,9 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } // RD estimation. - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, - &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, - NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_INTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate, + &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL); if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { mode_cost += x->angle_delta_cost[mbmi->mode - V_PRED] @@ -3519,13 +3802,16 @@ static void optimize_palette_colors(uint16_t *color_cache, int n_cache, // Given the base colors as specified in centroids[], calculate the RD cost // of palette mode. -static void palette_rd_y( - const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, - BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, - uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int *rate_overhead, int64_t *distortion, - int *skippable, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip) { +static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x, + MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int dc_mode_cost, const int *data, + int *centroids, int n, uint16_t *color_cache, + int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, + int64_t *best_model_rd, int *rate, int *rate_tokenonly, + int *rate_overhead, int64_t *distortion, + int *skippable, PICK_MODE_CONTEXT *ctx, + uint8_t *blk_skip) { optimize_palette_colors(color_cache, n_cache, n, 1, centroids); int k = av1_remove_duplicates(centroids, n); if (k < PALETTE_MIN_SIZE) { @@ -3551,7 +3837,8 @@ static void palette_rd_y( extend_palette_color_map(color_map, cols, rows, block_width, block_height); const int palette_mode_cost = intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); - int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); + int64_t this_model_rd = + intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return; @@ -3580,11 +3867,11 @@ static void palette_rd_y( } static int rd_pick_palette_intra_sby( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip) { + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) { int rate_overhead = 0; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3668,10 +3955,11 @@ static int rd_pick_palette_intra_sby( // where the dominant colors and the 
k-means results are similar. for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) { for (i = 0; i < n; ++i) centroids[i] = top_colors[i]; - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } // K-means clustering. @@ -3688,10 +3976,11 @@ static int rd_pick_palette_intra_sby( } av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); } - palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, - color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, &rate_overhead, - distortion, skippable, ctx, best_blk_skip); + palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data, + centroids, n, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, &rate_overhead, distortion, skippable, ctx, + best_blk_skip); } } @@ -3705,10 +3994,11 @@ static int rd_pick_palette_intra_sby( // Return 1 if a filter intra mode is selected; return 0 otherwise. static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int mode_cost, - int64_t *best_rd, int64_t *best_model_rd, + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int mode_cost, int64_t *best_rd, + int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; @@ -3727,7 +4017,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t this_rd, this_model_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) continue; @@ -3770,20 +4060,18 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, // Run RD calculation with given luma intra prediction angle, and return // the RD cost. Update the best mode info if the RD cost is the best so far.
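// Both the filter-intra loop above and calc_rd_given_intra_angle below prune
// a candidate once its model RD exceeds the best model RD by half again,
// computed with a shift instead of a multiply. The same gate restated on its
// own (the function name is ours):
#include <stdint.h>
static int beyond_model_rd_slack(int64_t this_model_rd,
                                 int64_t best_model_rd) {
  // best + (best >> 1) is floor(1.5 * best); INT64_MAX means "no best yet".
  return best_model_rd != INT64_MAX &&
         this_model_rd > best_model_rd + (best_model_rd >> 1);
}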
static int64_t calc_rd_given_intra_angle( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, - RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - int64_t *best_rd, int64_t *best_model_rd, TX_TYPE *best_txk_type, - uint8_t *best_blk_skip) { - int this_rate; + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta, + int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, + TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd, + TX_TYPE *best_txk_type, uint8_t *best_blk_skip) { RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; const int n4 = bsize_to_num_blk(bsize); assert(!is_inter_block(mbmi)); - mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; - this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); + this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) return INT64_MAX; @@ -3791,10 +4079,9 @@ static int64_t calc_rd_given_intra_angle( super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in); if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - this_rate = - tokenonly_rd_stats.rate + mode_cost + - x->angle_delta_cost[mbmi->mode - V_PRED] - [max_angle_delta + mbmi->angle_delta[PLANE_TYPE_Y]]; + int this_rate = + mode_cost + tokenonly_rd_stats.rate + + x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta]; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { @@ -3815,32 +4102,32 @@ static int64_t calc_rd_given_intra_angle( // With given luma directional intra prediction mode, pick the best angle delta // Return the RD cost corresponding to the best angle delta. static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd, + int mi_row, int mi_col, int *rate, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int mode_cost, int64_t best_rd, int64_t *best_model_rd) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; assert(!is_inter_block(mbmi)); - int i, angle_delta, best_angle_delta = 0; - int first_try = 1; - int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + + int best_angle_delta = 0; + int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; TX_SIZE best_tx_size = mbmi->tx_size; - const int n4 = bsize_to_num_blk(bsize); TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; - for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (i = 0; i < 2; ++i) { - best_rd_in = (best_rd == INT64_MAX) - ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 
3 : 5))); - this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + int first_try = 1; + for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + const int64_t best_rd_in = + (best_rd == INT64_MAX) ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 3 : 5))); + const int64_t this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in, + (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, best_model_rd, + best_txk_type, best_blk_skip); rd_cost[2 * angle_delta + i] = this_rd; if (first_try && this_rd == INT64_MAX) return best_rd; first_try = 0; @@ -3852,28 +4139,31 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, } assert(best_rd != INT64_MAX); - for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - int64_t rd_thresh; - for (i = 0; i < 2; ++i) { + for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { int skip_search = 0; - rd_thresh = best_rd + (best_rd >> 5); + const int64_t rd_thresh = best_rd + (best_rd >> 5); if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) skip_search = 1; if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_txk_type, best_blk_skip); + calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost, + best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, + &best_angle_delta, &best_tx_size, &best_rd, + best_model_rd, best_txk_type, best_blk_skip); } } } - mbmi->tx_size = best_tx_size; - mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; - memcpy(mbmi->txk_type, best_txk_type, - sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); - memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + memcpy(mbmi->txk_type, best_txk_type, + sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + } return best_rd; } @@ -4052,10 +4342,10 @@ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, // This function is used only for intra_only frames static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, - BLOCK_SIZE bsize, int64_t best_rd, - PICK_MODE_CONTEXT *ctx) { + int mi_row, int mi_col, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, BLOCK_SIZE bsize, + int64_t best_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); @@ -4098,13 +4388,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; /* Y Search for intra prediction mode */ - for (int mode_idx = DC_PRED; mode_idx < INTRA_MODES; ++mode_idx) { + for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t 
this_distortion, this_rd, this_model_rd; mbmi->mode = intra_rd_search_mode_order[mode_idx]; mbmi->angle_delta[PLANE_TYPE_Y] = 0; - this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]); + this_model_rd = + intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col); if (best_model_rd != INT64_MAX && this_model_rd > best_model_rd + (best_model_rd >> 1)) continue; @@ -4113,8 +4404,9 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, - bmode_costs[mbmi->mode], best_rd, &best_model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate, + &this_rd_stats, bsize, bmode_costs[mbmi->mode], + best_rd, &best_model_rd); } else { super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd); } @@ -4151,16 +4443,16 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } if (try_palette) { - rd_pick_palette_intra_sby(cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, - best_palette_color_map, &best_rd, &best_model_rd, - rate, rate_tokenonly, distortion, skippable, ctx, - ctx->blk_skip); + rd_pick_palette_intra_sby( + cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi, + best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly, + distortion, skippable, ctx, ctx->blk_skip); } if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { - if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, - skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, ctx)) { + if (rd_pick_filter_intra_sby( + cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable, + bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } @@ -4230,16 +4522,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int plane, int block, - int plane_bsize, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, + int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); const uint16_t cur_joint_ctx = - (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx; - + (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; const int txk_type_idx = av1_get_txk_type_index(plane_bsize, blk_row, blk_col); // Look up RD and terminate early in case when we've already processed exactly @@ -4264,7 +4552,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, RD_STATS this_rd_stats; search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); av1_merge_rd_stats(rd_stats, &this_rd_stats); @@ -4428,8 +4716,8 @@ static void try_tx_block_no_split( rd_stats->zero_rate = zero_blk_rate; const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); mbmi->inter_tx_size[index] = tx_size; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, pta, - ptl, rd_stats, ftxs_mode, ref_best_rd, + tx_block_rd_b(cpi, x, 
tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); assert(rd_stats->rate < INT_MAX); @@ -4444,12 +4732,12 @@ static void try_tx_block_no_split( rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * bw + blk_col] = 1; + set_blk_skip(x, 0, blk_row * bw + blk_col, 1); p->eobs[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { - x->blk_skip[blk_row * bw + blk_col] = 0; + set_blk_skip(x, 0, blk_row * bw + blk_col, 0); rd_stats->skip = 0; } @@ -4482,7 +4770,6 @@ static void try_tx_block_split( MACROBLOCKD *const xd = &x->e_mbd; const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); - struct macroblock_plane *const p = &x->plane[0]; const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; @@ -4490,10 +4777,7 @@ static void try_tx_block_split( RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; -#if CONFIG_DIST_8X8 - int sub8x8_eob[4] = { 0, 0, 0, 0 }; - struct macroblockd_plane *const pd = &xd->plane[0]; -#endif + split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; assert(tx_size < TX_SIZES_ALL); @@ -4511,123 +4795,22 @@ static void try_tx_block_split( &this_cost_valid, ftxs_mode, (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (!this_cost_valid) goto LOOP_EXIT; -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && tx_size == TX_8X8) { - sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block]; - } -#endif // CONFIG_DIST_8X8 + if (!this_cost_valid) goto LOOP_EXIT; + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); -#if CONFIG_DIST_8X8 - if (!x->using_dist_8x8) -#endif - if (no_split_rd < tmp_rd) { - this_cost_valid = 0; - goto LOOP_EXIT; - } + + if (no_split_rd < tmp_rd) { + this_cost_valid = 0; + goto LOOP_EXIT; + } block += sub_step; } } LOOP_EXIT : {} -#if CONFIG_DIST_8X8 - if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) { - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - - int64_t dist_8x8; - const int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &x->pred_luma[pred_idx]; - int i, j; - int row, col; - - uint8_t *pred8; - DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); - - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, - 8, 8, 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.sse == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->sse = dist_8x8; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - pred8 = CONVERT_TO_BYTEPTR(pred8_16); - else - pred8 = (uint8_t *)pred8_16; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for 
(j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = - pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - CONVERT_TO_SHORTPTR(pred8) - [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( - dst)[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } else { - for (row = 0; row < 2; ++row) { - for (col = 0; col < 2; ++col) { - int idx = row * 2 + col; - int eob = sub8x8_eob[idx]; - - if (eob > 0) { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; - } else { - for (j = 0; j < 4; j++) - for (i = 0; i < 4; i++) - pred8[(row * 4 + j) * 8 + 4 * col + i] = - dst[(row * 4 + j) * dst_stride + 4 * col + i]; - } - } - } - } - dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, - 8, 8, qindex) * - 16; - -#ifdef DEBUG_DIST_8X8 - if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) - assert(sum_rd_stats.dist == dist_8x8); -#endif // DEBUG_DIST_8X8 - - split_rd_stats->dist = dist_8x8; - tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); - } -#endif // CONFIG_DIST_8X8 if (this_cost_valid) *split_rd = tmp_rd; } @@ -4660,7 +4843,10 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, const int try_no_split = 1; int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; - +#if CONFIG_DIST_8X8 + if (x->using_dist_8x8) + try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16; +#endif TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; // TX no split @@ -4691,11 +4877,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } -#if COLLECT_TX_SIZE_DATA - // Do not skip tx_split when collecting tx size data. - try_split = 1; -#endif - // TX split int64_t split_rd = INT64_MAX; RD_STATS split_rd_stats; @@ -4707,54 +4888,6 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_info_node, &split_rd_stats, &split_rd); } -#if COLLECT_TX_SIZE_DATA - do { - if (tx_size <= TX_4X4 || depth >= MAX_VARTX_DEPTH) break; - -#if 0 - // Randomly select blocks to collect data to reduce output file size. - const int rnd_val = rand() % 2; - if (rnd_val) break; -#endif - - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); - const int within_border = - mi_row >= xd->tile.mi_row_start && - (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && - mi_col >= xd->tile.mi_col_start && - (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); - if (!within_border) break; - - FILE *fp = fopen(av1_tx_size_data_output_file, "a"); - if (!fp) break; - - // Split decision, RD cost, block type(inter/intra), q-index, rdmult, - // and block size. - const int split_selected = sum_rd < this_rd; - const int is_inter = 1; - const int txb_w = tx_size_wide[tx_size]; - const int txb_h = tx_size_high[tx_size]; - fprintf(fp, "%d,%lld,%lld,%d,%d,%d,%d,%d,", split_selected, - (long long)this_rd, (long long)sum_rd, cpi->common.base_qindex, - x->rdmult, is_inter, txb_w, txb_h); - - // Residue signal. 
- const int diff_stride = block_size_wide[plane_bsize]; - const int16_t *src_diff = - &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; - for (int r = 0; r < txb_h; ++r) { - for (int c = 0; c < txb_w; ++c) { - fprintf(fp, "%d,", src_diff[c]); - } - src_diff += diff_stride; - } - fprintf(fp, "\n"); - - fclose(fp); - } while (0); -#endif // COLLECT_TX_SIZE_DATA - if (no_split.rd < split_rd) { ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; @@ -4773,7 +4906,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, mbmi->tx_size = tx_size_selected; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, no_split.tx_type); - x->blk_skip[blk_row * bw + blk_col] = rd_stats->skip; + set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip); } else { *rd_stats = split_rd_stats; if (split_rd == INT64_MAX) *is_cost_valid = 0; @@ -4787,7 +4920,7 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, TXB_RD_INFO_NODE *rd_info_tree) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; - int64_t this_rd = 0; + int64_t this_rd = 0, skip_rd = 0; if (ref_best_rd < 0) is_cost_valid = 0; @@ -4818,42 +4951,39 @@ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + skip_rd = RDCOST(x->rdmult, s1, 0); + this_rd = RDCOST(x->rdmult, s0, 0); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd))); select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, - ftxs_mode, rd_info_tree); + &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode, + rd_info_tree); if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += - AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); block += step; if (rd_info_tree != NULL) rd_info_tree += 1; } } + if (skip_rd <= this_rd) { + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } else { + rd_stats->skip = 0; + } } - const int skip_ctx = av1_get_skip_context(xd); - const int s0 = x->skip_cost[skip_ctx][0]; - const int s1 = x->skip_cost[skip_ctx][1]; - int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - if (skip_rd <= this_rd) { - this_rd = skip_rd; - rd_stats->rate = 0; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; - } else { - rd_stats->skip = 0; - } - if (this_rd > ref_best_rd) is_cost_valid = 0; - if (!is_cost_valid) { // reset cost value av1_invalid_rd_stats(rd_stats); @@ -4945,8 +5075,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; rd_stats->ref_rdcost = ref_best_rd; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, 
ta, - tl, rd_stats, ftxs_mode, ref_best_rd, NULL); + tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4954,14 +5084,14 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; - x->blk_skip[blk_row * mi_width + blk_col] = 1; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1); x->plane[0].eobs[block] = 0; x->plane[0].txb_entropy_ctx[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); } else { rd_stats->skip = 0; - x->blk_skip[blk_row * mi_width + blk_col] = 0; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0); } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->txfm_partition_cost[ctx][0]; @@ -5128,12 +5258,13 @@ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash) { // Linear search through the circular buffer to find matching hash. - int index; - for (int i = cur_record->num - 1; i >= 0; i--) { - index = (cur_record->index_start + i) % TX_SIZE_RD_RECORD_BUFFER_LEN; - if (cur_record->hash_vals[index] == hash) return index; + for (int i = cur_record->index_start - 1; i >= 0; i--) { + if (cur_record->hash_vals[i] == hash) return i; } - + for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + int index; // If not found - add new RD info into the buffer and return its index if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { index = (cur_record->index_start + cur_record->num) % @@ -5150,6 +5281,155 @@ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, return index; } +typedef struct { + int leaf; + int8_t children[4]; +} RD_RECORD_IDX_NODE; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = { + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0, 0, 0, 0 } }, + { 1, { 0, 0, 0, 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0 } }, + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = { + { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 5, 6 } }, + { 0, { 7, 8, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 7, 8 } }, + { 0, { 5, 6, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = { + { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } }, + { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = { + { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } }, + { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } }, + { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } }, + { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } }, + { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = { + { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } }, + { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } 
}, + { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } }, + { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } }, + { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = { + { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } }, + { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } }, + { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } }, + { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } }, + { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } }, + { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } }, + { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } }, + { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } }, + { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } }, + { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = { + { 0, { 1, -1, 2, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = { + NULL, // BLOCK_4X4 + NULL, // BLOCK_4X8 + NULL, // BLOCK_8X4 + rd_record_tree_8x8, // BLOCK_8X8 + rd_record_tree_8x16, // BLOCK_8X16 + rd_record_tree_16x8, // BLOCK_16X8 + rd_record_tree_16x16, // BLOCK_16X16 + rd_record_tree_1_2, // BLOCK_16X32 + rd_record_tree_2_1, // BLOCK_32X16 + rd_record_tree_sqr, // BLOCK_32X32 + rd_record_tree_1_2, // BLOCK_32X64 + rd_record_tree_2_1, // BLOCK_64X32 + rd_record_tree_sqr, // BLOCK_64X64 + rd_record_tree_64x128, // BLOCK_64X128 + rd_record_tree_128x64, // BLOCK_128X64 + rd_record_tree_128x128, // BLOCK_128X128 + NULL, // BLOCK_4X16 + NULL, // BLOCK_16X4 + rd_record_tree_1_4, // BLOCK_8X32 + rd_record_tree_4_1, // BLOCK_32X8 + rd_record_tree_1_4, // BLOCK_16X64 + rd_record_tree_4_1, // BLOCK_64X16 +}; + +static const int rd_record_tree_size[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 0, // BLOCK_4X8 + 0, // BLOCK_8X4 + sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8 + sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16 + sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8 + sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64 + sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128 + sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64 + sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128 + 0, // BLOCK_4X16 + 0, // BLOCK_16X4 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16 +}; + +static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree, + BLOCK_SIZE bsize) { + const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize]; + const int size = rd_record_tree_size[bsize]; + for (int i = 0; 
i < size; ++i) { + if (rd_record[i].leaf) { + av1_zero(tree[i].children); + } else { + for (int j = 0; j < 4; ++j) { + const int8_t idx = rd_record[i].children[j]; + tree[i].children[j] = idx > 0 ? &tree[idx] : NULL; + } + } + } +} + // Go through all TX blocks that could be used in TX size search, compute // residual hash values for them and find matching RD info that stores previous // RD search results for these TX blocks. The idea is to prevent repeated @@ -5168,26 +5448,23 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, // Hashing is performed only for square TX sizes larger than TX_4X4 if (max_square_tx_size < TX_8X8) return 0; - - const int bw_mi = mi_size_wide[bsize]; const int diff_stride = bw; const struct macroblock_plane *const p = &x->plane[0]; const int16_t *diff = &p->src_diff[0]; - + init_rd_record_tree(dst_rd_info, bsize); // Coordinates of the top-left corner of current block within the superblock // measured in pixels: const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; int cur_rd_info_idx = 0; int cur_tx_depth = 0; - uint8_t parent_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; - uint8_t child_idx_buf[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; while (cur_tx_depth <= MAX_VARTX_DEPTH) { const int cur_tx_bw = tx_size_wide[cur_tx_size]; const int cur_tx_bh = tx_size_high[cur_tx_size]; if (cur_tx_bw < 8 || cur_tx_bh < 8) break; const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + const int tx_size_idx = cur_tx_size - TX_8X8; for (int row = 0; row < bh; row += cur_tx_bh) { for (int col = 0; col < bw; col += cur_tx_bw) { if (cur_tx_bw != cur_tx_bh) { @@ -5211,48 +5488,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh); - // Find corresponding RD info based on the hash value. - const int rd_record_idx = - row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) + - col_in_sb; - - int idx = find_tx_size_rd_info( - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash); + const int record_idx = + row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb; + TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx]; + int idx = find_tx_size_rd_info(records, hash); dst_rd_info[cur_rd_info_idx].rd_info_array = - &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx] - .tx_rd_info[idx]; - } - - // Update the output quadtree RD info structure. - av1_zero(dst_rd_info[cur_rd_info_idx].children); - const int this_mi_row = row / MI_SIZE; - const int this_mi_col = col / MI_SIZE; - if (cur_tx_depth > 0) { // Set up child pointers. - const int mi_index = this_mi_row * bw_mi + this_mi_col; - const int child_idx = child_idx_buf[mi_index]; - assert(child_idx < 4); - dst_rd_info[parent_idx_buf[mi_index]].children[child_idx] = - &dst_rd_info[cur_rd_info_idx]; - } - if (cur_tx_depth < MAX_VARTX_DEPTH) { // Set up parent and child idx. 
- const int tx_bh_mi = cur_tx_bh / MI_SIZE; - const int tx_bw_mi = cur_tx_bw / MI_SIZE; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; ++i) { - memset(parent_idx_buf + i * bw_mi + this_mi_col, cur_rd_info_idx, - tx_bw_mi); - } - int child_idx = 0; - const int next_tx_bh_mi = tx_size_wide_unit[next_tx_size]; - const int next_tx_bw_mi = tx_size_wide_unit[next_tx_size]; - for (int i = this_mi_row; i < this_mi_row + tx_bh_mi; - i += next_tx_bh_mi) { - for (int j = this_mi_col; j < this_mi_col + tx_bw_mi; - j += next_tx_bw_mi) { - assert(child_idx < 4); - child_idx_buf[i * bw_mi + j] = child_idx++; - } - } + &records->tx_rd_info[idx]; } ++cur_rd_info_idx; } @@ -5300,7 +5542,7 @@ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, const MACROBLOCKD *xd = &x->e_mbd; const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); - *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1); + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. @@ -5354,7 +5596,7 @@ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; - memset(x->blk_skip, 1, sizeof(x->blk_skip[0]) * n4); + for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1); rd_stats->skip = 1; rd_stats->rate = 0; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -5388,17 +5630,21 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int model_rate; int64_t model_dist; int model_skip; - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, - &model_skip, NULL, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist, + &model_skip, NULL, NULL, NULL, NULL); const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); // If the modeled rd is a lot worse than the best so far, breakout. // TODO(debargha, urvang): Improve the model and make the check below // tighter. assert(cpi->sf.model_based_prune_tx_search_level >= 0 && cpi->sf.model_based_prune_tx_search_level <= 2); + static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE, + 4 + MODELRD_TYPE_TX_SEARCH_PRUNE }; if (!model_skip && - model_rd / (5 - cpi->sf.model_based_prune_tx_search_level) > - ref_best_rd) + ((model_rd * + prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >> + 3) > ref_best_rd) return; } @@ -5431,7 +5677,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, // Precompute residual hashes and find existing or add new RD records to // store and reuse rate and distortion values to speed up TX size search. 
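// The records precomputed here live in small per-TX-size circular buffers
// keyed by a CRC32C hash of the residual block. A stripped-down model of the
// lookup-or-insert step, mirroring the two-leg newest-first scan in the
// rewritten find_tx_size_rd_info above (capacity and names are simplified):
#include <stdint.h>
#define REC_BUF_LEN 8
typedef struct {
  uint32_t hash_vals[REC_BUF_LEN];
  int index_start;  // oldest live slot
  int num;          // number of live slots
} HashedRecord;
static int lookup_or_insert(HashedRecord *rec, uint32_t hash) {
  // Scan newest-to-oldest in two straight segments, avoiding the modulo of
  // a plain circular walk.
  for (int i = rec->index_start - 1; i >= 0; i--)
    if (rec->hash_vals[i] == hash) return i;
  for (int i = rec->num - 1; i >= rec->index_start; i--)
    if (rec->hash_vals[i] == hash) return i;
  // Miss: claim the next free slot, or evict the oldest entry when full.
  int index;
  if (rec->num < REC_BUF_LEN) {
    index = (rec->index_start + rec->num) % REC_BUF_LEN;
    ++rec->num;
  } else {
    index = rec->index_start;
    rec->index_start = (rec->index_start + 1) % REC_BUF_LEN;
  }
  rec->hash_vals[index] = hash;
  return index;
}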
- TXB_RD_INFO_NODE matched_rd_info[16 + 64 + 256]; + TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64]; int found_rd_info = 0; if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) { found_rd_info = @@ -5479,34 +5725,61 @@ static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, assert(plane > 0); assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, - ta, tl, rd_stats, ftxs_mode, INT64_MAX, NULL); + &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL); + + const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; + const int blk_idx = blk_row * mi_width + blk_col; + av1_set_txb_context(x, plane, block, tx_size, ta, tl); + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) && + !xd->lossless[mbmi->segment_id]) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + } + + // Set chroma blk_skip to 0 + set_blk_skip(x, plane, blk_idx, 0); } // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, + int64_t non_skip_ref_best_rd, + int64_t skip_ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int plane; int is_cost_valid = 1; int64_t this_rd = 0; + int64_t skip_rd = 0; - if (ref_best_rd < 0) is_cost_valid = 0; + if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0; av1_init_rd_stats(rd_stats); - if (x->skip_chroma_rd) return is_cost_valid; + if (x->skip_chroma_rd) { + if (!is_cost_valid) av1_invalid_rd_stats(rd_stats); + + return is_cost_valid; + } + const BLOCK_SIZE bsizec = scale_chroma_bsize( bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); @@ -5531,36 +5804,31 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, const int step = bh * bw; ENTROPY_CONTEXT ta[MAX_MIB_SIZE]; ENTROPY_CONTEXT tl[MAX_MIB_SIZE]; - RD_STATS pn_rd_stats; - av1_init_rd_stats(&pn_rd_stats); av1_get_entropy_contexts(bsizec, pd, ta, tl); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if ((this_rd > non_skip_ref_best_rd) && + (skip_rd > skip_ref_best_rd)) { + av1_invalid_rd_stats(rd_stats); + return 0; + } block += step; } } - - if (pn_rd_stats.rate == INT_MAX) { - is_cost_valid = 0; 
- break; - } - - av1_merge_rd_stats(rd_stats, &pn_rd_stats); - - this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, rd_stats->zero_rate, rd_stats->sse)); - - if (this_rd > ref_best_rd) { - is_cost_valid = 0; - break; - } } - } - - if (!is_cost_valid) { + } else { // reset cost value av1_invalid_rd_stats(rd_stats); } @@ -6137,9 +6405,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { (mv->col >> 3) > mv_limits->col_max; } -static INLINE int get_single_mode(int this_mode, int ref_idx, - int is_comp_pred) { - int single_mode; +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx, int is_comp_pred) { + PREDICTION_MODE single_mode; if (is_comp_pred) { single_mode = ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); @@ -6149,63 +6417,6 @@ static INLINE int get_single_mode(int this_mode, int ref_idx, return single_mode; } -/* If the current mode shares the same mv with other modes with higher prority, - * skip this mode. This priority order is nearest > global > near. */ -static int skip_repeated_mv(const AV1_COMMON *const cm, - const MACROBLOCK *const x, int this_mode, - const MV_REFERENCE_FRAME ref_frames[2]) { - const int is_comp_pred = ref_frames[1] > INTRA_FRAME; - const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - if (!is_comp_pred) { - if (this_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the same motion vector as NEARESTMV - return 1; - } - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // NEARMV has the same motion vector as GLOBALMV - return 1; - } - } - if (this_mode == GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { - // GLOBALMV has the same motion vector as NEARESTMV - return 1; - } - } - } else { - for (int i = 0; i < 2; ++i) { - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); - if (single_mode == NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0) { - // NEARMV has the same motion vector as NEARESTMV in compound mode - return 1; - } - } - } - if (this_mode == NEAR_NEARMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 1 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // NEAR_NEARMV has the same motion vector as GLOBAL_GLOBALMV - return 1; - } - } - if (this_mode == GLOBAL_GLOBALMV) { - if (mbmi_ext->ref_mv_count[ref_frame_type] == 0 && - cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION && - cm->global_motion[ref_frames[1]].wmtype <= TRANSLATION) { - // GLOBAL_GLOBALMV has the same motion vector as NEARST_NEARSTMV - return 1; - } - } - } - return 0; -} - static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row, int mi_col, int_mv *ref_mv_sub8x8[2], @@ -6215,10 +6426,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; + const int plane = 0; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; const int refs[2] = { mbmi->ref_frame[0], 
mbmi->ref_frame[1] }; int_mv ref_mv[2]; int ite, ref; @@ -6228,11 +6441,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, struct macroblockd_plane *const pd = &xd->plane[0]; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; - int is_global[2]; + + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); + conv_params.use_jnt_comp_avg = 0; + WarpTypesAllowed warp_types[2]; for (ref = 0; ref < 2; ++ref) { const WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->ref_frame[ref]]; - is_global[ref] = is_global_mv_block(xd->mi[0], wm->wmtype); + const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype); + warp_types[ref].global_warp_allowed = is_global; + warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; } // Do joint motion search in compound mode to get more accurate mv. @@ -6244,30 +6462,38 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, }; // Prediction buffer from second frame. - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); - uint8_t *second_pred; + DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); (void)ref_mv_sub8x8; + const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode); + const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0); + MV *const best_mv = &x->best_mv.as_mv; + const int search_range = SEARCH_RANGE_8P; + const int sadpb = x->sadperbit16; // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. for (ite = 0; ite < 4; ite++) { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; - int sadpb = x->sadperbit16; - MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; - MvLimits tmp_mv_limits = x->mv_limits; int id = ite % 2; // Even iterations search in the first reference frame, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. - const int plane = 0; - ConvolveParams conv_params = get_conv_params(!id, 0, plane, xd->bd); - conv_params.use_jnt_comp_avg = 0; - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[!id]; - warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; - + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = av1_get_ref_mv(x, ref); // Swap out the reference frame for a version that's been scaled to @@ -6294,26 +6520,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[1] = xd->plane[plane].pre[1]; // Get the prediction block from the 'other' reference frame. - InterpFilters interp_filters = EIGHTTAP_REGULAR; + const InterpFilters interp_filters = EIGHTTAP_REGULAR; // Since we have scaled the reference frames to match the size of the // current frame we must use a unit scaling factor during mode selection.
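// Motion vectors in this search are kept in 1/8-pel units (hence
// MV_PRECISION_Q3 in the predictor calls and the ">> 3" shifts in the
// early-exit test above). A tiny restatement of that full-pel convergence
// check; the type and function name are ours:
#include <stdint.h>
typedef struct { int16_t row, col; } SubpelMv;  /* stand-in for libaom's MV */
static int same_fullpel_position(SubpelMv a, SubpelMv b) {
  // Dropping the three fractional bits maps eighth-pel to integer-pel.
  return (a.row >> 3) == (b.row >> 3) && (a.col >> 3) == (b.col >> 3);
}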
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); - av1_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &cur_mv[!id].as_mv, &cm->sf_identity, pw, ph, 0, interp_filters, - &warp_types, p_col, p_row, plane, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } else { - second_pred = (uint8_t *)second_pred_alloc_16; - av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, - second_pred, pw, &cur_mv[!id].as_mv, - &cm->sf_identity, pw, ph, &conv_params, - interp_filters, &warp_types, p_col, p_row, - plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd, cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, + second_pred, pw, &cur_mv[!id].as_mv, + &cm->sf_identity, pw, ph, &conv_params, + interp_filters, &warp_types[!id], p_col, p_row, + plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); const int order_idx = id != 0; av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, @@ -6325,15 +6541,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); // Use the mv result from the single mode as mv predictor. - // Use the mv result from the single mode as mv predictor. *best_mv = cur_mv[id].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost( - x, id, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0)); + av1_set_mvcost(x, id, ref_mv_idx); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6385,7 +6598,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Restore the pointer to the first prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; - if (bestsme < last_besterr[id]) { cur_mv[id].as_mv = *best_mv; last_besterr[id] = bestsme; @@ -6397,10 +6609,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; for (ref = 0; ref < 2; ++ref) { - av1_set_mvcost( - x, ref, - mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 
1 : 0)); - + av1_set_mvcost(x, ref, ref_mv_idx); const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -6710,16 +6919,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: - bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col), - (MI_SIZE * mi_row), 0); + bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0); break; case OBMC_CAUSAL: - bestsme = av1_obmc_full_pixel_diamond( - cpi, x, &mvp_full, step_param, sadpb, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, - &(x->best_mv.as_mv), 0); + bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, + 1, &cpi->fn_ptr[bsize], &ref_mv, + &(x->best_mv.as_mv), 0); break; default: assert(0 && "Invalid motion mode!\n"); } @@ -6850,25 +7059,17 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, cm->width, cm->height); - ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global; warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; // Get the prediction block from the 'other' reference frame. - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } else { - av1_build_inter_predictor( - ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, mbmi->interp_filters, &warp_types, p_col, p_row, plane, - !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, - cm->allow_warped_motion); - } + av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw, + other_mv, &sf, pw, ph, &conv_params, + mbmi->interp_filters, &warp_types, p_col, p_row, + plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd, cm->allow_warped_motion); av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, &xd->jcp_param.bck_offset, @@ -6921,7 +7122,7 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, int bestsme = INT_MAX; int sadpb = x->sadperbit16; MV *const best_mv = &x->best_mv.as_mv; - int search_range = 3; + int search_range = SEARCH_RANGE_8P; MvLimits tmp_mv_limits = x->mv_limits; @@ -7056,12 +7257,12 @@ static void do_masked_motion_search_indexed( // near mv modes to reduce distortion in subsequent blocks and also improve // visual quality. 
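// The define and test that follow implement that discount: when a lone new
// motion vector qualifies, its signalling cost is divided down before the RD
// comparison so it is not unfairly priced out. Schematically (the exact
// call-site shape in this file may differ; this helper is ours):
static int discount_mv_rate(int rate_mv, int discount_factor) {
  const int discounted = rate_mv / discount_factor;
  return discounted > 1 ? discounted : 1;  // keep a nonzero cost floor
}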
#define NEW_MV_DISCOUNT_FACTOR 8 -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext); static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x, - int this_mode, int_mv this_mv) { + PREDICTION_MODE this_mode, int_mv this_mv) { if (this_mode == NEWMV && this_mv.as_int != 0 && !cpi->rc.is_src_frame_alt_ref) { // Only discount new_mv when nearest_mv and all near_mv are zero, and the @@ -7176,6 +7377,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7199,28 +7401,27 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, (int64_t)aom_sum_squares_i16(residual1, N)) * (1 << WEDGE_WEIGHT_BITS) / 2; int16_t *ds = residual0; - if (N < 64) - av1_wedge_compute_delta_squares_c(ds, residual0, residual1, N); - else - av1_wedge_compute_delta_squares(ds, residual0, residual1, N); + + av1_wedge_compute_delta_squares(ds, residual0, residual1, N); for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); - // TODO(jingning): Make sse2 functions support N = 16 case - if (N < 64) - wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit); - else - wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); + wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + // int rate2; + // int64_t dist2; + // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2); + // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n", + // sse, rate, dist, rate2, dist2); dist = dist2; + // rate = rate2; + rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7248,6 +7449,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; + assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; @@ -7259,13 +7461,11 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); - if (N < 64) - sse = av1_wedge_sse_from_residuals_c(residual1, diff10, mask, N); - else - sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); rate += x->wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); @@ -7317,50 +7517,45 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int N = bw * bh; + const int N = 1 << num_pels_log2_lookup[bsize]; int rate; - uint64_t sse; int64_t dist; - int64_t rd0; DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; // try each mask type and its inverse for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { // build mask and inverse if (hbd) av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else - av1_build_compound_diffwtd_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, - bw, bh, bw); + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); // compute rd for mask - sse = av1_wedge_sse_from_residuals(residual1, diff10, xd->seg_mask, N); + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); sse = ROUND_POWER_OF_TWO(sse, bd_round); - model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd0 = RDCOST(x->rdmult, rate, dist); + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; best_rd = rd0; } } - - // make final mask mbmi->interinter_comp.mask_type = best_mask_type; - if (hbd) - av1_build_compound_diffwtd_mask_highbd( - xd->seg_mask, mbmi->interinter_comp.mask_type, CONVERT_TO_BYTEPTR(p0), - bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); - else - av1_build_compound_diffwtd_mask( - xd->seg_mask, mbmi->interinter_comp.mask_type, p0, bw, p1, bw, bh, bw); - + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } return best_rd; } @@ -7413,9 +7608,12 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, } } -static int interinter_compound_motion_search( - const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { +static int interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode, + int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; @@ -7440,11 +7638,40 @@ static int 
interinter_compound_motion_search( return tmp_rate_mv; } +static void get_inter_predictors_masked_compound( + const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int can_use_previous = cm->allow_warped_motion; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); + av1_build_inter_predictors_for_planes_single_buf( + xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); + const struct buf_2d *const src = &x->plane[0].src; + if (get_bitdepth_data_path_index(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, + bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +} + static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, - const BLOCK_SIZE bsize, const int this_mode, int *rs2, int rate_mv, - BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, - int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col) { + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd, + int *calc_pred_masked_compound) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -7456,19 +7683,30 @@ static int64_t build_and_cost_compound_type( int64_t tmp_skip_sse_sb; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + if (*calc_pred_masked_compound) { + get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0, + preds1, residual1, diff10, strides); + *calc_pred_masked_compound = 0; + } + best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10); *rs2 += get_interinter_compound_mask_rate(x, mbmi); best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); - if (have_newmv_in_inter_mode(this_mode) && - use_masked_motion_search(compound_type)) { + // Although the true rate_mv might be different after motion search, it is + // unlikely to be the best mode considering the transform rd cost and other + // mode overhead cost. + int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); + if (mode_rd > ref_best_rd) return INT64_MAX; + + if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) { *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col); av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
NULL, NULL, NULL); rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; @@ -7508,12 +7746,72 @@ typedef struct { int (*single_newmv_valid)[REF_FRAMES]; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array - int64_t (*modelled_rd)[REF_FRAMES]; + int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES]; int ref_frame_cost; int single_comp_cost; + int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES]; + int skip_motion_mode; + INTERINTRA_MODE *inter_intra_mode; } HandleInterModeArgs; +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. */ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(x, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, const AV1_COMMON *cm, const MACROBLOCK *x) { @@ -7640,62 +7938,97 @@ static INLINE int64_t interpolation_filter_rd( const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - int tmp_rate, tmp_skip_sb = 0; - int64_t tmp_dist, tmp_skip_sse = INT64_MAX; + int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 }; + int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 }; const InterpFilters last_best = mbmi->interp_filters; mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - if (!skip_pred) { - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); - av1_subtract_plane(x, bsize, 0); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, 0, &tmp_rate, &tmp_dist, - &tmp_skip_sb, &tmp_skip_sse, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, 
xd, 0, 0, &tmp_rate, &tmp_dist, &tmp_skip_sb, - &tmp_skip_sse, NULL, NULL, NULL); -#endif + assert(skip_pred != 2); + assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags)); + assert(rate[0] >= 0); + assert(dist[0] >= 0); + assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1)); + assert(skip_sse_sb[0] >= 0); + assert(rate[1] >= 0); + assert(dist[1] >= 0); + assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1)); + assert(skip_sse_sb[1] >= 0); + + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL); + tmp_rate[1] = tmp_rate[0]; + tmp_dist[1] = tmp_dist[0]; + } else { + // only luma MC is skipped + tmp_rate[1] = rate[0]; + tmp_dist[1] = dist[0]; + } if (num_planes > 1) { - int64_t tmp_y_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); - if (tmp_y_rd > *rd) { - mbmi->interp_filters = last_best; - return 0; + for (int plane = 1; plane < num_planes; ++plane) { + int tmp_rate_uv, tmp_skip_sb_uv; + int64_t tmp_dist_uv, tmp_skip_sse_uv; + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane); + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv, + &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL); + tmp_rate[1] = + (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX); + tmp_dist[1] += tmp_dist_uv; + tmp_skip_sb[1] &= tmp_skip_sb_uv; + tmp_skip_sse[1] += tmp_skip_sse_uv; } - int tmp_rate_uv, tmp_skip_sb_uv; - int64_t tmp_dist_uv, tmp_skip_sse_uv; - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 1; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 1, num_planes - 1, - &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv, - &tmp_skip_sse_uv, NULL, NULL, NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 1, num_planes - 1, &tmp_rate_uv, - &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, - NULL, NULL); -#endif - tmp_rate += tmp_rate_uv; - tmp_skip_sb &= tmp_skip_sb_uv; - tmp_dist += tmp_dist_uv; - tmp_skip_sse += tmp_skip_sse_uv; } } else { - tmp_rate = *rate; - tmp_dist = *dist; + // both luma and chroma MC is skipped + tmp_rate[1] = rate[1]; + tmp_dist[1] = dist[1]; } - int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); + int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]); + if (tmp_rd < *rd) { *rd = tmp_rd; *switchable_rate = tmp_rs; - *skip_txfm_sb = tmp_skip_sb; - *skip_sse_sb = tmp_skip_sse; - *rate = tmp_rate; - *dist = tmp_dist; - if (!skip_pred) { + if (skip_pred != cpi->default_interp_skip_flags) { + if (skip_pred == 0) { + // Overwrite the data as current filter is the best one + tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1]; + tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1]; + memcpy(rate, tmp_rate, 
sizeof(*rate) * 2); + memcpy(dist, tmp_dist, sizeof(*dist) * 2); + memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2); + memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2); + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) { + // As luma MC data is not computed, update of luma data can be skipped + rate[1] = tmp_rate[1]; + dist[1] = tmp_dist[1]; + skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1]; + skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1]; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } swap_dst_buf(xd, dst_bufs, num_planes); } return 1; @@ -7715,8 +8048,8 @@ static INLINE int find_best_horiz_interp_filter_rd( int i; const int bw = block_size_wide[bsize]; assert(best_dual_mode == 0); - if ((bw <= 4) && (!skip_hor)) { - int skip_pred = 1; + if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { @@ -7726,7 +8059,7 @@ static INLINE int find_best_horiz_interp_filter_rd( dist)) { best_dual_mode = i; } - skip_pred = 0; + skip_pred = skip_hor; } } else { for (i = 1; i < SWITCHABLE_FILTERS; ++i) { @@ -7751,8 +8084,8 @@ static INLINE void find_best_vert_interp_filter_rd( int best_dual_mode, int filter_set_size) { int i; const int bh = block_size_high[bsize]; - if ((bh <= 4) && (!skip_ver)) { - int skip_pred = 1; + if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) { + int skip_pred = cpi->default_interp_skip_flags; // Process the filters in reverse order to enable reusing rate and // distortion (calcuated during EIGHTTAP_REGULAR) for MULTITAP_SHARP assert(filter_set_size == DUAL_FILTER_SET_SIZE); @@ -7762,7 +8095,7 @@ static INLINE void find_best_vert_interp_filter_rd( switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, i, switchable_ctx, skip_pred, rate, dist); - skip_pred = 0; + skip_pred = skip_ver; } } else { for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size; @@ -7784,6 +8117,7 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, return 0; } } + if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0; return 1; } @@ -7806,11 +8140,11 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, const int comp_idx = mbmi->compound_idx; const int offset = x->interp_filter_stats_idx[comp_idx]; if (offset < MAX_INTERP_FILTER_STATS) { - INTERPOLATION_FILTER_STATS stat = { - mbmi->interp_filters, - { mbmi->mv[0], mbmi->mv[1] }, - { mbmi->ref_frame[0], mbmi->ref_frame[1] }, - }; + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type }; x->interp_filter_stats[comp_idx][offset] = stat; x->interp_filter_stats_idx[comp_idx]++; } @@ -7821,15 +8155,22 @@ static int64_t interpolation_filter_search( int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, - int64_t 
*const skip_sse_sb) { + int64_t *const skip_sse_sb, const int skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int need_search = av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); - int i, tmp_rate; - int64_t tmp_dist; + int i; + // Index 0 corresponds to luma rd data and index 1 corresponds to cummulative + // data of all planes + int tmp_rate[2] = { 0, 0 }; + int64_t tmp_dist[2] = { 0, 0 }; + int best_skip_txfm_sb[2] = { 1, 1 }; + int64_t best_skip_sse_sb[2] = { 0, 0 }; + const int ref_frame = xd->mi[0]->ref_frame[0]; (void)single_filter; int match_found = -1; @@ -7845,18 +8186,32 @@ static int64_t interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); -#if DNN_BASED_RD_INTERP_FILTER - model_rd_for_sb_with_dnn(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, skip_txfm_sb, skip_sse_sb, NULL, NULL, - NULL); -#else - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist, - skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL); -#endif // DNN_BASED_RD_INTERP_FILTER - *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); + if (!skip_build_pred) + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], + &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL); + if (num_planes > 1) + model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1], + &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL, + NULL); + tmp_rate[1] = + (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX); + assert(tmp_rate[1] >= 0); + tmp_dist[1] = tmp_dist[0] + tmp_dist[1]; + best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1]; + best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1]; + *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]); + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); if (assign_filter != SWITCHABLE || match_found != -1) { return 0; @@ -7866,22 +8221,71 @@ static int64_t interpolation_filter_search( av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); return 0; } - int skip_hor = 1; - int skip_ver = 1; + if (args->modelled_rd != NULL) { + if (has_second_ref(mbmi)) { + const int ref_mv_idx = mbmi->ref_mv_idx; + int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; + const int mode0 = compound_ref0_mode(mbmi->mode); + const int mode1 = compound_ref1_mode(mbmi->mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { + return INT64_MAX; + } + } + } + + x->recalc_luma_mc_data = 0; + // skip_flag=xx (in binary form) + // Setting the 0th bit corresponds to skipping luma MC and setting the 1st + // bit corresponds to skipping chroma MC. skip_flag=0 corresponds to "Don't + // skip luma and chroma MC", skip_flag=1 corresponds to "Skip luma MC only", + // skip_flag=2 is not a valid case, and + // skip_flag=3 corresponds to "Skip both luma and chroma MC" + int skip_hor = cpi->default_interp_skip_flags; + int skip_ver = cpi->default_interp_skip_flags; const int is_compound = has_second_ref(mbmi); - for (int k = 0; k < num_planes - 1; ++k) { - struct macroblockd_plane *const pd = &xd->plane[k]; - const int bw = pd->width; - const int bh = pd->height; - for (int j = 0; j < 1 + is_compound; ++j) { - const MV mv = mbmi->mv[j].as_mv; + assert(is_intrabc_block(mbmi) == 0); + for (int j = 0; j < 1 + is_compound; ++j) { + const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME]; + const struct scale_factors *const sf = &ref_buf->sf; + // TODO(any): Refine skip flag calculation considering scaling + if (av1_is_scaled(sf)) { + skip_hor = 0; + skip_ver = 0; + break; + } + const MV mv = mbmi->mv[j].as_mv; + int skip_hor_plane = 0; + int skip_ver_plane = 0; + for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) { + struct macroblockd_plane *const pd = &xd->plane[k]; + const int bw = pd->width; + const int bh = pd->height; const MV mv_q4 = clamp_mv_to_umv_border_sb( xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - skip_hor &= (sub_x == 0); - skip_ver &= (sub_y == 0); - } + skip_hor_plane |= ((sub_x == 0) << k); + skip_ver_plane |= ((sub_y == 0) << k); + } + skip_hor = skip_hor & skip_hor_plane; + skip_ver = skip_ver & skip_ver_plane; + // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" + assert(skip_hor != 2); + assert(skip_ver != 2); + } + // When the compound prediction type is compound segment wedge, luma MC and + // chroma MC need to go hand in hand as the mask generated during luma MC is + // required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask used for + // chroma MC during vertical filter decision may be incorrect as temporary MC + // evaluation overwrites the mask.
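
A standalone sketch of the packed skip flags computed above: bit k is set when plane k (0 = luma, 1 = chroma) has an integer-pel MV component, so 3 means "MC can be skipped for both planes" and 2 (chroma integer-pel while luma is sub-pel) never occurs. The phase mask value and the fixed two-plane layout are simplifications of the loop above.

#include <assert.h>
#include <stdio.h>

#define SUBPEL_MASK_STUB 15 /* low 4 bits = 1/16-pel phase, illustrative */

static int pack_skip_flag(int luma_phase, int chroma_phase) {
  int flag = 0;
  flag |= ((luma_phase & SUBPEL_MASK_STUB) == 0) << 0;   /* luma integer?   */
  flag |= ((chroma_phase & SUBPEL_MASK_STUB) == 0) << 1; /* chroma integer? */
  assert(flag != 2); /* mirrors assert(skip_hor != 2) above */
  return flag;
}

int main(void) {
  printf("%d\n", pack_skip_flag(16, 32)); /* both integer-pel -> 3 */
  printf("%d\n", pack_skip_flag(3, 5));   /* both sub-pel     -> 0 */
  return 0;
}
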
Make skip_ver as 0 for this case so that mask is + // populated during luma MC + if (is_compound && mbmi->compound_idx == 1 && + mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { + assert(mbmi->comp_group_idx == 1); + if (skip_hor == 0 && skip_ver == 1) skip_ver = 0; } // do interp_filter search const int filter_set_size = DUAL_FILTER_SET_SIZE; @@ -7895,14 +8299,14 @@ static int64_t interpolation_filter_search( // EIGHTTAP_REGULAR mode is calculated beforehand best_dual_mode = find_best_horiz_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, - &tmp_rate, &tmp_dist, best_dual_mode); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor, + tmp_rate, tmp_dist, best_dual_mode); // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes find_best_vert_interp_filter_rd( x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, - skip_txfm_sb, skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, - &tmp_rate, &tmp_dist, best_dual_mode, filter_set_size); + best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, + tmp_rate, tmp_dist, best_dual_mode, filter_set_size); } else { // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { @@ -7912,12 +8316,25 @@ static int64_t interpolation_filter_search( if (filter_x != filter_y) continue; } interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i, switchable_ctx, 0, &tmp_rate, - &tmp_dist); + switchable_rate, best_skip_txfm_sb, + best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0, + tmp_rate, tmp_dist); + assert(x->recalc_luma_mc_data == 0); } } swap_dst_buf(xd, dst_bufs, num_planes); + // Recompute final MC data if required + if (x->recalc_luma_mc_data == 1) { + // Recomputing final luma MC data is required only if the same was skipped + // in either of the directions Condition below is necessary, but not + // sufficient + assert((skip_hor == 1) || (skip_ver == 1)); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + *skip_txfm_sb = best_skip_txfm_sb[1]; + *skip_sse_sb = best_skip_sse_sb[1]; + x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); + // save search results if (cpi->sf.skip_repeat_interpolation_filter_search) { assert(match_found == -1); @@ -7926,6 +8343,301 @@ static int64_t interpolation_filter_search( return 0; } +static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int mode_rate, int64_t ref_best_rd) { + /* + * This function combines y and uv planes' transform search processes + * together, when the prediction is generated. It first does subtration to + * obtain the prediction error. Then it calls + * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and + * handles the early terminations happen in those functions. At the end, it + * computes the rd_stats/_y/_uv accordingly. + */ + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int skip_txfm_sb = 0; + const int num_planes = av1_num_planes(cm); + const int ref_frame_1 = mbmi->ref_frame[1]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? 
INT64_MAX : ref_best_rd - mode_rd; + const int skip_ctx = av1_get_skip_context(xd); + const int64_t min_header_rate = + mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + av1_invalid_rd_stats(rd_stats); + return 0; + } + + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + av1_init_rd_stats(rd_stats_uv); + rd_stats->rate = mode_rate; + + if (!cpi->common.all_lossless) + check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); + if (!skip_txfm_sb) { + int64_t non_skip_rdcosty = INT64_MAX; + int64_t skip_rdcosty = INT64_MAX; + int64_t min_rdcosty = INT64_MAX; + int is_cost_valid_uv = 0; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + // Motion mode + select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } + + if (rd_stats_y->rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + // TODO(angiebird): check if we need this + // restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + non_skip_rdcosty = RDCOST( + x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist); + skip_rdcosty = + RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse); + min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + + if (min_rdcosty > ref_best_rd) { + int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) > + rd_thresh) + av1_invalid_rd_stats(rd_stats_y); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + + if (num_planes > 1) { + /* clang-format off */ + is_cost_valid_uv = + inter_block_uvrd(cpi, x, rd_stats_uv, bsize, + ref_best_rd - non_skip_rdcosty, + ref_best_rd - skip_rdcosty, FTXS_NONE); + if (!is_cost_valid_uv) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + /* clang-format on */ + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } else { + av1_init_rd_stats(rd_stats_uv); + } + if (rd_stats->skip) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + mbmi->skip = 0; + // here mbmi->skip temporarily plays a role as what this_skip2 does + + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } else if (!xd->lossless[mbmi->segment_id] && + (RDCOST(x->rdmult, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][0], + rd_stats->dist) >= + RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + 
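
A standalone sketch of the skip decision made in the RDCOST comparison just above: coding the block as skip drops all coefficient rate but pays the full SSE as distortion, so the two header+payload costs are compared directly. rdcost() here is a simplified stand-in for libaom's RDCOST macro; the shift amounts are illustrative, not the real constants.

#include <stdint.h>
#include <stdio.h>

static int64_t rdcost(int rdmult, int rate, int64_t dist) {
  return (((int64_t)rate * rdmult) >> 9) + (dist << 4); /* illustrative */
}

/* Returns 1 when signalling skip=1 is the cheaper choice. */
static int choose_skip(int rdmult, int coeff_rate, int64_t coded_dist,
                       const int skip_cost[2], int64_t sse) {
  const int64_t rd_code =
      rdcost(rdmult, coeff_rate + skip_cost[0], coded_dist);
  const int64_t rd_skip = rdcost(rdmult, skip_cost[1], sse);
  return rd_code >= rd_skip;
}

int main(void) {
  const int skip_cost[2] = { 40, 300 };
  /* Cheap coefficients, large SSE saved: coding beats skipping -> 0. */
  printf("%d\n", choose_skip(128, 500, 1000, skip_cost, 50000));
  return 0;
}
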
rd_stats->rate += x->skip_cost[skip_ctx][1]; + rd_stats->dist = rd_stats->sse; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + mbmi->skip = 1; + } else { + rd_stats->rate += x->skip_cost[skip_ctx][0]; + mbmi->skip = 0; + } + } else { + x->skip = 1; + mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); + // The cost of skip bit needs to be added. + mbmi->skip = 0; + rd_stats->rate += x->skip_cost[skip_ctx][1]; + + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->skip = 1; + int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } + } + return 1; +} + +static int handle_inter_intra_mode(const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, + int64_t ref_best_rd, int *rate_mv, + int *tmp_rate2, BUFFER_SET *orig_dst) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + + INTERINTRA_MODE best_interintra_mode = II_DC_PRED; + int64_t rd, best_interintra_rd = INT64_MAX; + int rmode, rate_sum; + int64_t dist_sum; + int tmp_rate_mv = 0; + int tmp_skip_txfm_sb; + int bw = block_size_wide[bsize]; + int64_t tmp_skip_sse_sb; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + const int_mv mv0 = mbmi->mv[0]; + const int is_wedge_used = is_interintra_wedge_used(bsize); + int rwedge = is_wedge_used ? 
x->wedge_interintra_cost[bsize][0] : 0; + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->use_wedge_interintra = 0; + best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]]; + int j = 0; + if (cpi->sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == INTERINTRA_MODES) { + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) { + mbmi->interintra_mode = best_interintra_mode; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum); + best_interintra_rd = rd; + if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { + return -1; + } + if (is_wedge_used) { + int64_t best_interintra_rd_nowedge = rd; + int64_t best_interintra_rd_wedge = INT64_MAX; + int_mv tmp_mv; + // Disable wedge search if source variance is small + if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { + mbmi->use_wedge_interintra = 1; + + rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + + x->wedge_interintra_cost[bsize][1]; + + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + best_interintra_rd_wedge += + RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0); + rd = INT64_MAX; + // Refine motion vector. 
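
A hedged sketch of the save/refine/restore pattern the hunk below applies to the wedge-interintra MV: remember the starting MV and its rate, try a refinement, and fall back when the refined RD is not better. The stub types and the two callbacks stand in for compound_single_motion_search() and the model-RD evaluation; none of these names are libaom APIs.

#include <stdint.h>
#include <stdio.h>

typedef struct { int32_t as_int; } IntMvStub;

typedef int64_t (*EvalRdFn)(IntMvStub mv, void *ctx);
typedef IntMvStub (*RunSearchFn)(IntMvStub start, int *rate_mv, void *ctx);

static int64_t try_refine_mv(IntMvStub *mv, int *rate_mv, EvalRdFn eval_rd,
                             RunSearchFn run_search, void *ctx) {
  const IntMvStub mv0 = *mv;      /* save the starting MV */
  const int rate_mv0 = *rate_mv;  /* ...and its signalling rate */
  int refined_rate_mv = rate_mv0;
  const IntMvStub refined = run_search(mv0, &refined_rate_mv, ctx);
  const int64_t rd0 = eval_rd(mv0, ctx);
  if (refined.as_int != mv0.as_int) {
    const int64_t rd1 = eval_rd(refined, ctx);
    if (rd1 < rd0) { /* keep the refinement only when it helps */
      *mv = refined;
      *rate_mv = refined_rate_mv;
      return rd1;
    }
  }
  *mv = mv0; /* restore: refinement did not beat the starting point */
  *rate_mv = rate_mv0;
  return rd0;
}

static int64_t eval_rd_stub(IntMvStub mv, void *ctx) {
  (void)ctx; /* toy cost: prefer small vectors */
  return mv.as_int < 0 ? -(int64_t)mv.as_int : (int64_t)mv.as_int;
}

static IntMvStub search_stub(IntMvStub start, int *rate_mv, void *ctx) {
  (void)ctx;
  *rate_mv += 2; /* the refined MV costs a little more to code */
  IntMvStub out = { start.as_int / 2 }; /* pretend we found a better MV */
  return out;
}

int main(void) {
  IntMvStub mv = { 100 };
  int rate_mv = 10;
  printf("%lld\n",
         (long long)try_refine_mv(&mv, &rate_mv, eval_rd_stub, search_stub,
                                  NULL)); /* prints 50 */
  return 0;
}
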
+ if (have_newmv_in_inter_mode(mbmi->mode)) { + // get negative of mask + const uint8_t *mask = av1_get_contiguous_soft_mask( + mbmi->interintra_wedge_index, 1, bsize); + tmp_mv = mbmi->mv[0]; + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, &tmp_rate_mv, + 0); + if (mbmi->mv[0].as_int != tmp_mv.as_int) { + mbmi->mv[0].as_int = tmp_mv.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, + bsize); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, + dist_sum); + } + } + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (rd != INT64_MAX) + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->use_wedge_interintra = 1; + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->mv[0].as_int = mv0.as_int; + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + } else { + mbmi->use_wedge_interintra = 0; + } + } // if (is_interintra_wedge_used(bsize)) + if (num_planes > 1) { + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); + } + return 0; +} + // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, @@ -7933,11 +8645,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, HandleInterModeArgs *const args, - int64_t ref_best_rd, const int *refs, int rate_mv, - BUFFER_SET *orig_dst + int64_t ref_best_rd, const int *refs, + int *rate_mv, BUFFER_SET *orig_dst #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + int do_tx_search, InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *const cm = &cpi->common; @@ -7946,41 +8659,49 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0; + const int rate2_nocoeff = rd_stats->rate; + int best_xskip, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; MB_MODE_INFO base_mbmi, best_mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int rate_mv0 = *rate_mv; + int interintra_allowed = cm->seq_params.enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; - int total_samples; - - (void)rate_mv; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; av1_invalid_rd_stats(&best_rd_stats); - aom_clear_system_state(); - mbmi->num_proj_ref[0] = 
findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); - total_samples = mbmi->num_proj_ref[0]; - rate2_nocoeff = rd_stats->rate; + mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + if (cm->switchable_motion_mode) { + last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->allow_warped_motion); + } + if (last_motion_mode_allowed == WARPED_CAUSAL) { + mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); + } + int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + last_motion_mode_allowed = OBMC_CAUSAL; + } base_mbmi = *mbmi; - MOTION_MODE last_motion_mode_allowed = - cm->switchable_motion_mode - ? motion_mode_allowed(xd->global_motion, xd, mbmi, - cm->allow_warped_motion) - : SIMPLE_TRANSLATION; - assert(mbmi->ref_frame[1] != INTRA_FRAME); - const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; + const int switchable_rate = + av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0; int64_t best_rd = INT64_MAX; - + int best_rate_mv = rate_mv0; for (int mode_index = (int)SIMPLE_TRANSLATION; mode_index <= (int)last_motion_mode_allowed + interintra_allowed; mode_index++) { + if (args->skip_motion_mode && mode_index) continue; int64_t tmp_rd = INT64_MAX; int tmp_rate2 = rate2_nocoeff; int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; int skip_txfm_sb = 0; + int tmp_rate_mv = rate_mv0; *mbmi = base_mbmi; if (is_interintra_mode) { @@ -7995,10 +8716,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, // The prediction is calculated before motion_mode_rd() is called in // handle_inter_mode() } else if (mbmi->motion_mode == OBMC_CAUSAL) { - mbmi->motion_mode = OBMC_CAUSAL; - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; - + uint32_t cur_mv = mbmi->mv[0].as_int; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; #if USE_DISCOUNT_NEWMV_TEST @@ -8006,36 +8726,38 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if (mbmi->mv[0].as_int != cur_mv) { + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); } - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; mbmi->motion_mode = WARPED_CAUSAL; - mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; mbmi->interp_filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->interp_filter)); memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); // Select the samples according to motion vector difference - if (mbmi->num_proj_ref[0] > 1) { - mbmi->num_proj_ref[0] = selectSamples( - &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0], bsize); + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); } - if 
(!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col)) { + &mbmi->wm_params, mi_row, mi_col)) { // Refine MV for NEWMV mode - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { - int tmp_rate_mv = 0; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { const int_mv mv0 = mbmi->mv[0]; - const WarpedMotionParams wm_params0 = mbmi->wm_params[0]; - int num_proj_ref0 = mbmi->num_proj_ref[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + int num_proj_ref0 = mbmi->num_proj_ref; // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, @@ -8057,12 +8779,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #endif - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } else { // Restore the old MV and WM parameters. mbmi->mv[0] = mv0; - mbmi->wm_params[0] = wm_params0; - mbmi->num_proj_ref[0] = num_proj_ref0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; } } @@ -8071,144 +8793,10 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, continue; } } else if (is_interintra_mode) { - INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t rd, best_interintra_rd = INT64_MAX; - int rmode, rate_sum; - int64_t dist_sum; - int j; - int tmp_rate_mv = 0; - int tmp_skip_txfm_sb; - int bw = block_size_wide[bsize]; - int64_t tmp_skip_sse_sb; - DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); - uint8_t *tmp_buf, *intrapred; - const int *const interintra_mode_cost = - x->interintra_mode_cost[size_group_lookup[bsize]]; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_); - intrapred = CONVERT_TO_BYTEPTR(intrapred_); - } else { - tmp_buf = tmp_buf_; - intrapred = intrapred_; - } - const int_mv mv0 = mbmi->mv[0]; - - mbmi->ref_frame[1] = NONE_FRAME; - xd->plane[0].dst.buf = tmp_buf; - xd->plane[0].dst.stride = bw; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); - - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = mbmi->interintra_mode; - } - } - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = 
RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); - best_interintra_rd = rd; - - if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { - // restore ref_frame[1] - mbmi->ref_frame[1] = ref_frame_1; - continue; - } - - if (is_interintra_wedge_used(bsize)) { - int64_t best_interintra_rd_nowedge = INT64_MAX; - int64_t best_interintra_rd_wedge = INT64_MAX; - int_mv tmp_mv; - InterpFilters backup_interp_filters = mbmi->interp_filters; - int rwedge = x->wedge_interintra_cost[bsize][0]; - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum + rwedge, dist_sum); - best_interintra_rd_nowedge = rd; - - // Disable wedge search if source variance is small - if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { - mbmi->use_wedge_interintra = 1; - - rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + - x->wedge_interintra_cost[bsize][1]; - - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); - - best_interintra_rd_wedge += - RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); - // Refine motion vector. - if (have_newmv_in_inter_mode(mbmi->mode)) { - // get negative of mask - const uint8_t *mask = av1_get_contiguous_soft_mask( - mbmi->interintra_wedge_index, 1, bsize); - tmp_mv = av1_get_ref_mv(x, 0); - compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, - mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0); - mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, - bsize); - av1_subtract_plane(x, bsize, 0); - model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, - NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge, - dist_sum); - if (rd >= best_interintra_rd_wedge) { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - mbmi->interp_filters = backup_interp_filters; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - } else { - tmp_mv.as_int = mv0.as_int; - tmp_rate_mv = rate_mv; - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - // Evaluate closer to true rd - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, - INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); - best_interintra_rd_wedge = rd; - if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { - mbmi->use_wedge_interintra = 1; - mbmi->mv[0].as_int = tmp_mv.as_int; - tmp_rate2 += tmp_rate_mv - rate_mv; - } else { - mbmi->use_wedge_interintra = 0; - mbmi->mv[0].as_int = mv0.as_int; - mbmi->interp_filters = backup_interp_filters; - } - } else { - mbmi->use_wedge_interintra = 0; - } - } // if (is_interintra_wedge_used(bsize)) - restore_dst_buf(xd, *orig_dst, num_planes); - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + const int ret = handle_inter_intra_mode( + cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv, + &tmp_rate2, orig_dst); + if (ret < 0) continue; } if (!cpi->common.all_lossless) @@ -8220,8 +8808,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->sse = 0; rd_stats->skip = 1; rd_stats->rate = tmp_rate2; - if (av1_is_interp_needed(xd)) - rd_stats->rate += av1_get_switchable_rate(cm, x, xd); + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; if (interintra_allowed) { rd_stats->rate += 
x->interintra_cost[size_group_lookup[bsize]] [mbmi->ref_frame[1] == INTRA_FRAME]; @@ -8246,167 +8833,86 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; } } + if (!skip_txfm_sb) { #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t est_rd = 0; int est_skip = 0; - if (cpi->sf.inter_mode_rd_model_estimation) { - InterModeRdModel *md = &inter_mode_rd_models[mbmi->sb_type]; + if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + cm->tile_rows == 1) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type]; if (md->ready) { const int64_t curr_sse = get_sse(cpi, x); - est_rd = - get_est_rd(mbmi->sb_type, x->rdmult, curr_sse, rd_stats->rate); + est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse, + rd_stats->rate); est_skip = est_rd * 0.8 > *best_est_rd; -#if INTER_MODE_RD_TEST - if (est_rd < *best_est_rd) { - *best_est_rd = est_rd; - } -#else // INTER_MODE_RD_TEST if (est_skip) { - ++md->skip_count; mbmi->ref_frame[1] = ref_frame_1; continue; } else { if (est_rd < *best_est_rd) { *best_est_rd = est_rd; } - ++md->non_skip_count; } -#endif // INTER_MODE_RD_TEST } } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS + } - int64_t rdcosty = INT64_MAX; - int is_cost_valid_uv = 0; - - // cost and distortion - av1_subtract_plane(x, bsize, 0); - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // Motion mode - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, - ref_best_rd); -#if CONFIG_COLLECT_RD_STATS == 2 - PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); -#endif // CONFIG_COLLECT_RD_STATS == 2 +#if CONFIG_COLLECT_INTER_MODE_RD_STATS + if (!do_tx_search) { + const int64_t curr_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (cm->reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); + } } else { - super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd); - memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y->skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, mbmi); } - - if (rd_stats_y->rate == INT_MAX) { - av1_invalid_rd_stats(rd_stats); - if (mbmi->motion_mode != SIMPLE_TRANSLATION || - mbmi->ref_frame[1] == INTRA_FRAME) { - mbmi->ref_frame[1] = ref_frame_1; - continue; - } else { - restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = ref_frame_1; + } else { +#endif + int mode_rate = rd_stats->rate; + if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y, + rd_stats_uv, mode_rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { return INT64_MAX; } + continue; } - - av1_merge_rd_stats(rd_stats, rd_stats_y); - - rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); - if (num_planes > 1) { - /* clang-format off */ - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty, - FTXS_NONE); - if 
(!is_cost_valid_uv) { - mbmi->ref_frame[1] = ref_frame_1; - continue; + if (!skip_txfm_sb) { + const int64_t curr_rd = + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); - } else { - av1_init_rd_stats(rd_stats_uv); - } -#if CONFIG_RD_DEBUG - // record transform block coefficient cost - // TODO(angiebird): So far rd_debug tool only detects discrepancy of - // coefficient cost. Therefore, it is fine to copy rd_stats into mbmi - // here because we already collect the coefficient cost. Move this part to - // other place when we need to compare non-coefficient cost. - mbmi->rd_stats = *rd_stats; -#endif // CONFIG_RD_DEBUG - const int skip_ctx = av1_get_skip_context(xd); - if (rd_stats->skip) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - mbmi->skip = 0; - // here mbmi->skip temporarily plays a role as what this_skip2 does - } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][0], - rd_stats->dist) >= RDCOST(x->rdmult, - x->skip_cost[skip_ctx][1], - rd_stats->sse))) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - rd_stats->dist = rd_stats->sse; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - mbmi->skip = 1; - } else { - rd_stats->rate += x->skip_cost[skip_ctx][0]; - mbmi->skip = 0; - } - *disable_skip = 0; + *disable_skip = 0; #if CONFIG_COLLECT_INTER_MODE_RD_STATS - if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && - cm->tile_rows == 1) { -#if INTER_MODE_RD_TEST - if (md->ready) { - int64_t real_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (est_skip) { - ++md->skip_count; - if (real_rd < ref_best_rd) { - ++md->fp_skip_count; - } - // int fp_skip = real_rd < ref_best_rd; - // printf("est_skip %d fp_skip %d est_rd %ld best_est_rd %ld real_rd - // %ld ref_best_rd %ld\n", - // est_skip, fp_skip, est_rd, *best_est_rd, real_rd, - // ref_best_rd); - } else { - ++md->non_skip_count; - } + if (cpi->sf.inter_mode_rd_model_estimation) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); } -#endif // INTER_MODE_RD_TEST - inter_mode_data_push(mbmi->sb_type, rd_stats->sse, rd_stats->dist, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][mbmi->skip], - rd_stats->rate, ref_best_rd); - } #endif // CONFIG_COLLECT_INTER_MODE_RD_STATS - int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (curr_rd < ref_best_rd) { - ref_best_rd = curr_rd; + } else { + *disable_skip = 1; } - } else { - x->skip = 1; - *disable_skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); - - // The cost of skip bit needs to be added. 
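
A sketch of the estimated-RD gate used with cpi->sf.inter_mode_rd_model_estimation in the hunks above, where "est_rd * 0.8 > *best_est_rd" drops a candidate before the expensive transform search. The linear dist-from-sse model and its coefficients are made-up stand-ins for the tile-level statistics, and the RDCOST shifts are illustrative.

#include <stdint.h>
#include <stdio.h>

typedef struct {
  double a; /* assumed model: dist ~ a * sse + b */
  double b;
  int ready;
} RdModelStub;

static int64_t estimate_rd(const RdModelStub *m, int rdmult, int64_t sse,
                           int header_rate) {
  const double est_dist = m->a * (double)sse + m->b;
  return (((int64_t)header_rate * rdmult) >> 9) +
         ((int64_t)est_dist << 4); /* illustrative RDCOST shifts */
}

int main(void) {
  const RdModelStub model = { 0.7, 100.0, 1 };
  const int64_t best_est_rd = 200000;
  const int64_t est_rd = estimate_rd(&model, 128, 20000, 3000);
  /* Mirrors the "est_rd * 0.8 > *best_est_rd" gate from the diff. */
  const int est_skip = (double)est_rd * 0.8 > (double)best_est_rd;
  printf("est_rd=%lld skip=%d\n", (long long)est_rd, est_skip);
  return 0;
}
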
- mbmi->skip = 0; - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; - - rd_stats->dist = 0; - rd_stats->sse = 0; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->skip = 1; +#if CONFIG_COLLECT_INTER_MODE_RD_STATS } +#endif if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { if (is_nontrans_global_motion(xd, xd->mi[0])) { @@ -8416,23 +8922,24 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, } tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if ((mbmi->motion_mode == SIMPLE_TRANSLATION && - mbmi->ref_frame[1] != INTRA_FRAME) || - (tmp_rd < best_rd)) { + if (mode_index == 0) + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + if ((mode_index == 0) || (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); best_xskip = x->skip; best_disable_skip = *disable_skip; if (best_xskip) break; } } mbmi->ref_frame[1] = ref_frame_1; - + *rate_mv = best_rate_mv; if (best_rd == INT64_MAX) { av1_invalid_rd_stats(rd_stats); restore_dst_buf(xd, *orig_dst, num_planes); @@ -8443,7 +8950,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, *rd_stats_y = best_rd_stats_y; if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); x->skip = best_xskip; *disable_skip = best_disable_skip; @@ -8482,15 +8989,9 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, return 0; } -#ifndef NDEBUG -static INLINE int is_single_inter_mode(int this_mode) { - return this_mode >= SINGLE_INTER_MODE_START && - this_mode < SINGLE_INTER_MODE_END; -} -#endif - -static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { - assert(is_single_inter_mode(single_mode)); +static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode, + uint8_t ref_mv_idx) { + assert(is_inter_singleref_mode(single_mode)); int ref_mv_offset; if (single_mode == NEARESTMV) { ref_mv_offset = 0; @@ -8502,14 +9003,15 @@ static INLINE int get_ref_mv_offset(int single_mode, uint8_t ref_mv_idx) { return ref_mv_offset; } -static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, - int ref_mv_idx, +static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext) { const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); const int is_comp_pred = ref_frame[1] > INTRA_FRAME; - const int single_mode = get_single_mode(this_mode, ref_idx, is_comp_pred); - assert(is_single_inter_mode(single_mode)); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, ref_idx, is_comp_pred); + assert(is_inter_singleref_mode(single_mode)); if (single_mode == NEWMV) { this_mv->as_int = INVALID_MV; } else if (single_mode == GLOBALMV) { @@ -8533,7 +9035,7 @@ static INLINE void get_this_mv(int_mv *this_mv, int this_mode, int ref_idx, } // This function update the non-new mv for the current prediction mode -static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, +static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, const AV1_COMMON *cm, const MACROBLOCK *x) { 
const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; @@ -8543,7 +9045,8 @@ static INLINE int build_cur_mv(int_mv *cur_mv, int this_mode, int_mv this_mv; get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame, x->mbmi_ext); - const int single_mode = get_single_mode(this_mode, i, is_comp_pred); + const PREDICTION_MODE single_mode = + get_single_mode(this_mode, i, is_comp_pred); if (single_mode == NEWMV) { cur_mv[i] = this_mv; } else { @@ -8584,18 +9087,29 @@ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, return cost; } -static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_col, int mi_row, - int_mv *cur_mv, int masked_compound_used, - BUFFER_SET *orig_dst, BUFFER_SET *tmp_dst, - int *rate_mv, int64_t *rd, - RD_STATS *rd_stats, int64_t ref_best_rd) { +// Struct for buffers used by compound_type_rd() function. +// For sizes and alignment of these arrays, refer to +// alloc_compound_type_rd_buffers() function. +typedef struct { + uint8_t *pred0; + uint8_t *pred1; + int16_t *residual1; // src - pred1 + int16_t *diff10; // pred1 - pred0 + uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask +} CompoundTypeRdBuffers; + +static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_col, int mi_row, + int_mv *cur_mv, int masked_compound_used, + BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, + CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, + int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; int rate_sum, rs2; int64_t dist_sum; @@ -8605,45 +9119,19 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t tmp_skip_sse_sb; INTERINTER_COMPOUND_DATA best_compound_data; best_compound_data.type = COMPOUND_AVERAGE; - DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); - DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 - DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 - uint8_t tmp_best_mask_buf[2 * MAX_SB_SQUARE]; - uint8_t *preds0[1] = { pred0 }; - uint8_t *preds1[1] = { pred1 }; + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; int strides[1] = { bw }; int tmp_rate_mv; const int num_pix = 1 << num_pels_log2_lookup[bsize]; const int mask_len = 2 * num_pix * sizeof(uint8_t); COMPOUND_TYPE cur_type; int best_compmode_interinter_cost = 0; - int can_use_previous = cm->allow_warped_motion; + int calc_pred_masked_compound = 1; best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; - if (masked_compound_used) { - // get inter predictors to use for masked compound modes - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous); - av1_build_inter_predictors_for_planes_single_buf( - xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); - const struct buf_2d *const src = &x->plane[0].src; - if (get_bitdepth_data_path_index(xd)) { - aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, - CONVERT_TO_BYTEPTR(pred1), bw, xd->bd); - aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), - 
bw, CONVERT_TO_BYTEPTR(pred0), bw, xd->bd); - } else { - aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1, - bw); - aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw); - } - } - const int orig_is_best = xd->plane[0].dst.buf == orig_dst->plane[0]; - const BUFFER_SET *backup_buf = orig_is_best ? tmp_dst : orig_dst; - const BUFFER_SET *best_buf = orig_is_best ? orig_dst : tmp_dst; for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; if (!is_interinter_compound_used(cur_type, bsize)) continue; @@ -8662,17 +9150,17 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, } masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; rs2 = masked_type_cost; - // No need to call av1_build_inter_predictors_sby here - // 1. COMPOUND_AVERAGE is always the first candidate - // 2. av1_build_inter_predictors_sby has been called by - // interpolation_filter_search - int64_t est_rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + int64_t est_rd = + estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); + if (est_rd != INT64_MAX) + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + } // use spare buffer for following compound type try - restore_dst_buf(xd, *backup_buf, 1); - if (est_rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + restore_dst_buf(xd, *tmp_dst, 1); } else { mbmi->comp_group_idx = 1; masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; @@ -8682,19 +9170,20 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, *rd / 3 < ref_best_rd) { best_rd_cur = build_and_cost_compound_type( cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, - &tmp_rate_mv, preds0, preds1, residual1, diff10, strides, mi_row, - mi_col); + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, mi_row, mi_col, rd_stats->rate, ref_best_rd, + &calc_pred_masked_compound); } } if (best_rd_cur < *rd) { *rd = best_rd_cur; best_compound_data = mbmi->interinter_comp; if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) { - memcpy(tmp_best_mask_buf, xd->seg_mask, mask_len); + memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); } best_compmode_interinter_cost = rs2; if (have_newmv_in_inter_mode(this_mode)) { - if (use_masked_motion_search(cur_type)) { + if (cur_type == COMPOUND_WEDGE) { best_tmp_rate_mv = tmp_rate_mv; best_mv[0].as_int = mbmi->mv[0].as_int; best_mv[1].as_int = mbmi->mv[1].as_int; @@ -8712,28 +9201,69 @@ static INLINE int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->comp_group_idx = (best_compound_data.type == COMPOUND_AVERAGE) ? 
0 : 1; mbmi->interinter_comp = best_compound_data; - memcpy(xd->seg_mask, tmp_best_mask_buf, mask_len); + memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); } if (have_newmv_in_inter_mode(this_mode)) { mbmi->mv[0].as_int = best_mv[0].as_int; mbmi->mv[1].as_int = best_mv[1].as_int; - if (use_masked_motion_search(mbmi->interinter_comp.type)) { + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { rd_stats->rate += best_tmp_rate_mv - *rate_mv; *rate_mv = best_tmp_rate_mv; } } - restore_dst_buf(xd, *best_buf, 1); + restore_dst_buf(xd, *orig_dst, 1); return best_compmode_interinter_cost; } +static INLINE int is_single_newmv_valid(HandleInterModeArgs *args, + MB_MODE_INFO *mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +typedef struct { + int64_t rd; + int drl_cost; + int rate_mv; + int_mv mv; +} inter_mode_info; + static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, - HandleInterModeArgs *args, int64_t ref_best_rd + HandleInterModeArgs *args, int64_t ref_best_rd, + uint8_t *const tmp_buf, + CompoundTypeRdBuffers *rd_buffers #if CONFIG_COLLECT_INTER_MODE_RD_STATS , - int64_t *best_est_rd + TileDataEnc *tile_data, int64_t *best_est_rd, + const int do_tx_search, + InterModesInfo *inter_modes_info #endif ) { const AV1_COMMON *cm = &cpi->common; @@ -8742,15 +9272,26 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); - const int this_mode = mbmi->mode; + const PREDICTION_MODE this_mode = mbmi->mode; int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int rate_mv = 0; - DECLARE_ALIGNED(32, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); int64_t rd = INT64_MAX; - BUFFER_SET orig_dst, tmp_dst; + + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. 
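/*
 * [Editor's sketch, not part of the patch] The BUFFER_SET pair built below
 * implements the ping-pong strategy described in the comment above;
 * restore_dst_buf() is what points the plane buffers back at the chosen
 * set. A minimal model of the idea, with hypothetical names:
 *
 *   uint8_t *best = dst_buf, *scratch = tmp_buf;
 *   for (each candidate) {
 *     predict_into(scratch);
 *     if (cost(scratch) < cost(best)) swap(&best, &scratch);
 *   }
 *   if (best != dst_buf) memcpy(dst_buf, best, size);  // copy back if needed
 */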
+ struct macroblockd_plane *p = xd->plane; + BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -8765,36 +9306,29 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO best_mbmi = *mbmi; int best_disable_skip; int best_xskip; - int plane_rate[MAX_MB_PLANE] = { 0 }; - int64_t plane_sse[MAX_MB_PLANE] = { 0 }; - int64_t plane_dist[MAX_MB_PLANE] = { 0 }; int64_t newmv_ret_val = INT64_MAX; int_mv backup_mv[2] = { { 0 } }; int backup_rate_mv = 0; + inter_mode_info mode_info[MAX_REF_MV_SERCH]; int comp_idx; const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp & (mbmi->mode != GLOBAL_GLOBALMV); - const int has_drl = (have_nearmv_in_inter_mode(mbmi->mode) && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1); - // TODO(jingning): This should be deprecated shortly. - const int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; - const int ref_set = - has_drl ? AOMMIN(MAX_REF_MV_SERCH, - mbmi_ext->ref_mv_count[ref_frame_type] - idx_offset) - : 1; + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mode_info[ref_mv_idx].mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].rd = INT64_MAX; + if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) { if (mbmi->ref_frame[0] == LAST2_FRAME || mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[1] == LAST2_FRAME || mbmi->ref_frame[1] == LAST3_FRAME) { - if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + idx_offset] + if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv] .weight < REF_CAT_LEVEL) { continue; } @@ -8811,41 +9345,40 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; + mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = ref_mv_idx; - if (is_comp_pred) { - for (int ref_idx = 0; ref_idx < is_comp_pred + 1; ++ref_idx) { - const int single_mode = - get_single_mode(this_mode, ref_idx, is_comp_pred); - if (single_mode == NEWMV && - args->single_newmv[mbmi->ref_mv_idx][mbmi->ref_frame[ref_idx]] - .as_int == INVALID_MV) - continue; - } + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) { + continue; } rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; - rd_stats->rate += + const int drl_cost = get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } - const RD_STATS backup_rd_stats = *rd_stats; - const MB_MODE_INFO backup_mbmi = *mbmi; int64_t best_rd2 = INT64_MAX; + const RD_STATS backup_rd_stats = *rd_stats; // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. 
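/*
 * [Editor's note, not part of the patch] Spelled out, the loop bound below
 * gives:
 *   search_jnt_comp == 1  ->  comp_idx = 1, then comp_idx = 0
 *                             (distance-weighted compound is tried after
 *                             the averaged one)
 *   search_jnt_comp == 0  ->  comp_idx = 1 only, the forced value,
 * since the lower bound !search_jnt_comp evaluates to 0 or 1 respectively.
 */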
for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { int rs = 0; int compmode_interinter_cost = 0; - *rd_stats = backup_rd_stats; - *mbmi = backup_mbmi; mbmi->compound_idx = comp_idx; - if (is_comp_pred && comp_idx == 0) { + *rd_stats = backup_rd_stats; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->comp_group_idx = 0; - mbmi->compound_idx = 0; const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); @@ -8885,32 +9418,69 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } else { rd_stats->rate += rate_mv; } + + if (cpi->sf.skip_repeated_newmv) { + if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) { + int skip = 0; + int this_rate_mv = 0; + for (i = 0; i < ref_mv_idx; ++i) { + // Check if the motion search result is the same as previous results + if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) { + // If the compared mode has no valid rd, it is unlikely this + // mode will be the best mode + if (mode_info[i].rd == INT64_MAX) { + skip = 1; + break; + } + // Compare the cost difference including drl cost and mv cost + if (mode_info[i].mv.as_int != INVALID_MV) { + const int compare_cost = + mode_info[i].rate_mv + mode_info[i].drl_cost; + const int_mv ref_mv = av1_get_ref_mv(x, 0); + this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv, + &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + const int this_cost = this_rate_mv + drl_cost; + + if (compare_cost < this_cost) { + skip = 1; + break; + } else { + // If the cost is less than the current best result, make this + // the best and update the corresponding variables + if (best_mbmi.ref_mv_idx == i) { + assert(best_rd != INT64_MAX); + best_mbmi.ref_mv_idx = ref_mv_idx; + best_rd_stats.rate += this_cost - compare_cost; + best_rd = RDCOST(x->rdmult, best_rd_stats.rate, + best_rd_stats.dist); + if (best_rd < ref_best_rd) ref_best_rd = best_rd; + + skip = 1; + break; + } + } + } + } + } + if (skip) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = + args->modelled_rd[this_mode][i][refs[0]]; + args->simple_rd[this_mode][ref_mv_idx][refs[0]] = + args->simple_rd[this_mode][i][refs[0]]; + mode_info[ref_mv_idx].rd = mode_info[i].rd; + mode_info[ref_mv_idx].rate_mv = this_rate_mv; + mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; + + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } } for (i = 0; i < is_comp_pred + 1; ++i) { mbmi->mv[i].as_int = cur_mv[i].as_int; } - - // Initialise tmp_dst and orig_dst buffers to prevent "may be used - // uninitialized" warnings in GCC when the stream is monochrome. - memset(tmp_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(tmp_dst.stride, 0, sizeof(tmp_dst.stride)); - memset(orig_dst.plane, 0, sizeof(tmp_dst.plane)); - memset(orig_dst.stride, 0, sizeof(tmp_dst.stride)); - - // do first prediction into the destination buffer. Do the next - // prediction into a temporary buffer. Then keep track of which one - // of these currently holds the best predictor, and use the other - // one for future predictions. In the end, copy from tmp_buf to - // dst if necessary.
- for (i = 0; i < num_planes; i++) { - tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE; - tmp_dst.stride[i] = MAX_SB_SIZE; - } - for (i = 0; i < num_planes; i++) { - orig_dst.plane[i] = xd->plane[i].dst.buf; - orig_dst.stride[i] = xd->plane[i].dst.stride; - } - const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); #if USE_DISCOUNT_NEWMV_TEST // We don't include the cost of the second reference here, because there @@ -8937,47 +9507,62 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } - ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, - args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb); - if (ret_val != 0) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; - } else if (cpi->sf.model_based_post_interp_filter_breakout && - ref_best_rd != INT64_MAX && (rd / 6 > ref_best_rd)) { - restore_dst_buf(xd, orig_dst, num_planes); - if ((rd >> 4) > ref_best_rd) break; - continue; - } - + int skip_build_pred = 0; if (is_comp_pred && comp_idx) { + // Find matching interp filter or set to default interp filter + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); + } + int64_t best_rd_compound; compmode_interinter_cost = compound_type_rd( cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used, - &orig_dst, &tmp_dst, &rate_mv, &best_rd_compound, rd_stats, - ref_best_rd); + &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, + rd_stats, ref_best_rd); if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - if (mbmi->interinter_comp.type != COMPOUND_AVERAGE) { - int tmp_rate; - int64_t tmp_dist; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - for (int plane = 0; plane < num_planes; ++plane) - av1_subtract_plane(x, bsize, plane); - model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, - &tmp_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, - plane_sse, plane_dist); - rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); + // No need to call av1_build_inter_predictors_sby if + // COMPOUND_AVERAGE is selected because it is the first + // candidate in compound_type_rd, and the following + // compound type searches use the tmp_dst buffer + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) { + if (num_planes > 1) + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst, + bsize); + skip_build_pred = 1; + } } + ret_val = interpolation_filter_search( + x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, + args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb, + skip_build_pred, args, ref_best_rd); + if (args->modelled_rd != NULL && !is_comp_pred) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; + } + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + if ((rd >> 3) * 2 > ref_best_rd) break; + continue; + } + if (search_jnt_comp) { // if 1/2 model rd is larger than best_rd in jnt_comp mode, // use jnt_comp mode, save additional search - if ((rd 
>> 1) > best_rd) { + if ((rd >> 3) * 4 > best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } @@ -8991,31 +9576,31 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_comp_pred) { const int mode0 = compound_ref0_mode(this_mode); const int mode1 = compound_ref1_mode(this_mode); - const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]], - args->modelled_rd[mode1][refs[1]]); - if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) { + const int64_t mrd = + AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { restore_dst_buf(xd, orig_dst, num_planes); continue; } - } else { - args->modelled_rd[this_mode][refs[0]] = rd; - } - } - - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - // if current pred_error modeled rd is substantially more than the best - // so far, do not bother doing full rd - if (rd / 2 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; } } - rd_stats->rate += compmode_interinter_cost; if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { // TODO(chengchen): this speed feature introduces big loss. // Need better estimation of rate distortion. + int dummy_rate; + int64_t dummy_dist; + int plane_rate[MAX_MB_PLANE] = { 0 }; + int64_t plane_sse[MAX_MB_PLANE] = { 0 }; + int64_t plane_dist[MAX_MB_PLANE] = { 0 }; + + model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND]( + cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate, + &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse, + plane_dist); + rd_stats->rate += rs; rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2]; rd_stats_y->rate = plane_rate[0]; @@ -9028,18 +9613,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; } else { #if CONFIG_COLLECT_INTER_MODE_RD_STATS - ret_val = - motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mi_row, mi_col, args, ref_best_rd, - refs, rate_mv, &orig_dst, best_est_rd); + ret_val = motion_mode_rd( + cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, + mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst, + tile_data, best_est_rd, do_tx_search, inter_modes_info); #else ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mi_row, mi_col, - args, ref_best_rd, refs, rate_mv, &orig_dst); + args, ref_best_rd, refs, &rate_mv, &orig_dst); #endif } + mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; + mode_info[ref_mv_idx].rate_mv = rate_mv; if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + mode_info[ref_mv_idx].rd = tmp_rd; if (tmp_rd < best_rd) { best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; @@ -9049,7 +9637,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, best_disable_skip = *disable_skip; best_xskip = x->skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); } if (tmp_rd < best_rd2) { @@ -9062,8 +9650,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } restore_dst_buf(xd, orig_dst, num_planes); } - - args->modelled_rd = NULL; } if (best_rd == INT64_MAX) return INT64_MAX; @@ -9078,7 +9664,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, assert(IMPLIES(mbmi->comp_group_idx 
== 1, mbmi->interinter_comp.type != COMPOUND_AVERAGE)); memcpy(x->blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w); return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); } @@ -9186,8 +9772,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; int cost_list[5]; int bestsme = av1_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, + sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); x->mv_limits = tmp_mv_limits; @@ -9229,8 +9815,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, } else { super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats.skip); } if (num_planes > 1) { super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); @@ -9254,7 +9840,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_noskip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } if (!xd->lossless[mbmi->segment_id]) { @@ -9271,7 +9857,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, best_skip = x->skip; best_rdcost = rdc_skip; memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } } } @@ -9279,7 +9865,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, *rd_cost = best_rdcost; x->skip = best_skip; memcpy(x->blk_skip, best_blk_skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); return best_rd; } @@ -9302,8 +9888,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, mbmi->mv[0].as_int = 0; const int64_t intra_yrd = - rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, - &y_skip, bsize, best_rd, ctx); + rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, bsize, best_rd, ctx); if (intra_yrd < best_rd) { // Only store reconstructed luma when there's chroma RDO. 
When there's no @@ -9447,6 +10033,17 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) { + if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, + mi_col, mbmi_ext->mode_context); + } assert(this_mode == NEAREST_NEARESTMV); if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) { @@ -9508,7 +10105,7 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); - set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n8_w, xd->n8_h, + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h, search_state->best_mbmode.skip && is_inter_block(mbmi), xd); // Set up color-related variables for skip mode. @@ -9595,11 +10192,12 @@ static void sf_refine_fast_tx_type_search( } else { super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - memset(x->blk_skip, rd_stats_y.skip, - sizeof(x->blk_skip[0]) * xd->n8_h * xd->n8_w); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y.skip); } if (num_planes > 1) { - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, FTXS_NONE); + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX, + FTXS_NONE); } else { av1_init_rd_stats(&rd_stats_uv); } @@ -9647,7 +10245,7 @@ static void sf_refine_fast_tx_type_search( static void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], - uint32_t mode_skip_mask[REF_FRAMES], + uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES], unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { @@ -9700,18 +10298,45 @@ static void set_params_rd_pick_inter_mode( x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; x->mbmi_ext->compound_mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, yv12_mb); } } - - // TODO(zoeliu@google.com): To further optimize the obtaining of motion vector - // references for compound prediction, as not every pair of reference frames - // woud be examined for the RD evaluation. 
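/*
 * [Editor's note, not part of the patch] This hunk introduces UINT8_MAX as
 * a "not computed yet" sentinel for mbmi_ext->ref_mv_count: every reference
 * type is pre-marked with it below, av1_find_mv_refs() fills in real counts
 * for the references that are actually scanned, and consumers such as the
 * rd_pick_skip_mode() hunk above re-run av1_find_mv_refs() on demand when
 * they still see the sentinel. Reduced sketch:
 *
 *   mbmi_ext->ref_mv_count[ref] = UINT8_MAX;        // mark as unknown
 *   ...
 *   if (mbmi_ext->ref_mv_count[ref] == UINT8_MAX)   // later consumer
 *     av1_find_mv_refs(...);                        // compute lazily
 */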
+ // ref_frame = ALTREF_FRAME for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + continue; + } + } av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row, mi_col, mbmi_ext->mode_context); @@ -9838,9 +10463,10 @@ static void set_params_rd_pick_inter_mode( } } -static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, +static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, RD_STATS *rd_cost, + PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, + MB_MODE_INFO *const mbmi, PALETTE_MODE_INFO *const pmi, unsigned int *ref_costs_single, InterModeSearchState *search_state) { @@ -9867,9 +10493,9 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; rate_overhead_palette = rd_pick_palette_intra_sby( - cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, - best_palette_color_map, &best_rd_palette, &best_model_rd_palette, NULL, - NULL, NULL, NULL, ctx, best_blk_skip); + cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, &best_rd_palette, + &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip); if (pmi->palette_size[0] == 0) return; memcpy(x->blk_skip, best_blk_skip, @@ -9986,15 +10612,49 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state, av1_zero(search_state->single_newmv); av1_zero(search_state->single_newmv_rate); av1_zero(search_state->single_newmv_valid); - for (int i = 0; i < MB_MODE_COUNT; ++i) - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) - search_state->modelled_rd[i][ref_frame] = INT64_MAX; + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < MAX_REF_MV_SERCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + } + } + } + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); } +// Case 1: return 0 means don't skip this mode +// Case 2: return 1 means skip this mode completely +// Case 3: return 2 means skip compound prediction only, but still try the single motion modes static int 
inter_mode_search_order_independent_skip( - const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, - int mi_row, int mi_col, uint32_t *mode_skip_mask, - uint16_t *ref_frame_skip_mask) { + const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x, + BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col, + uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask, + InterModeSearchState *search_state) { const SPEED_FEATURES *const sf = &cpi->sf; const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; @@ -10003,6 +10663,32 @@ static int inter_mode_search_order_independent_skip( const unsigned char segment_id = mbmi->segment_id; const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + int skip_motion_mode = 0; + if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { + const int ref_type = av1_ref_frame_type(ref_frame); + int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type); + if (ref_type <= ALTREF_FRAME && skip_ref) { + // Since the compound ref modes depend on the motion estimation results of + // the two single ref modes (the best mv of each single ref mode is used as + // the starting point), if the current single ref mode is marked skip, we + // need to check whether it will be used in the compound ref modes. + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(ctx->skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_type || rf[1] == ref_type) { + // Found a compound ref mode that is not skipped and contains the + // current single ref, so this single ref can't be skipped completely. + // Just skip its motion mode search, but still try its simple + // translation mode. 
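/*
 * [Editor's worked example, not part of the patch] Suppose the mask has the
 * bit for single-ref LAST_FRAME set, but the bit for the compound type
 * {LAST_FRAME, ALTREF_FRAME} clear. LAST_FRAME then cannot be dropped
 * outright, because the compound search seeds its motion estimation from
 * the single-ref result; only the motion-mode search for LAST_FRAME is
 * skipped, which is what the flag set below records.
 */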
+ skip_motion_mode = 1; + skip_ref = 0; + break; + } + } + } + } + if (skip_ref) return 1; + } if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && !x->cb_partition_scan) { @@ -10115,9 +10801,12 @@ static int inter_mode_search_order_independent_skip( return 1; } - if (skip_repeated_mv(cm, x, this_mode, ref_frame)) { + if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) { return 1; } + if (skip_motion_mode) { + return 2; + } return 0; } @@ -10139,12 +10828,13 @@ static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index, set_default_interp_filters(mbmi, cm->interp_filter); } -static int handle_intra_mode(InterModeSearchState *search_state, - const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int ref_frame_cost, - const PICK_MODE_CONTEXT *ctx, int disable_skip, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, - RD_STATS *rd_stats_uv) { +static int64_t handle_intra_mode(InterModeSearchState *search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10159,9 +10849,19 @@ static int handle_intra_mode(InterModeSearchState *search_state, const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int num_planes = av1_num_planes(cm); - av1_init_rd_stats(rd_stats); - av1_init_rd_stats(rd_stats_y); - av1_init_rd_stats(rd_stats_uv); + const int skip_ctx = av1_get_skip_context(xd); + + int known_rate = intra_mode_cost[mbmi->mode]; + known_rate += ref_frame_cost; + if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED) + known_rate += intra_cost_penalty; + known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > search_state->best_rd) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } + TX_SIZE uv_tx; int is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && av1_use_angle_delta(bsize)) { @@ -10178,20 +10878,33 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->directional_mode_skip_mask); search_state->angle_stats_ready = 1; } - if (search_state->directional_mode_skip_mask[mbmi->mode]) return 0; + if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX; + av1_init_rd_stats(rd_stats_y); rd_stats_y->rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, - intra_mode_cost[mbmi->mode], search_state->best_rd, - &model_rd); + rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y, + bsize, intra_mode_cost[mbmi->mode], + search_state->best_rd, &model_rd); } else { + av1_init_rd_stats(rd_stats_y); mbmi->angle_delta[PLANE_TYPE_Y] = 0; super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd); } uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); - + int try_filter_intra = 0; + int64_t best_rd_tmp = INT64_MAX; if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + if (rd_stats_y->rate != INT_MAX) { + const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] + + intra_mode_cost[mbmi->mode]; + best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = !((best_rd_tmp / 2) > 
search_state->best_rd); + } else { + try_filter_intra = !(search_state->best_mbmode.skip); + } + } + if (try_filter_intra) { RD_STATS rd_stats_y_fi; int filter_intra_selected_flag = 0; TX_SIZE best_tx_size = mbmi->tx_size; @@ -10199,20 +10912,12 @@ static int handle_intra_mode(InterModeSearchState *search_state, memcpy(best_txk_type, mbmi->txk_type, sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN); FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; - int64_t best_rd_tmp = INT64_MAX; - if (rd_stats_y->rate != INT_MAX) { - best_rd_tmp = RDCOST(x->rdmult, - rd_stats_y->rate + x->filter_intra_cost[bsize][0] + - intra_mode_cost[mbmi->mode], - rd_stats_y->dist); - } mbmi->filter_intra_mode_info.use_filter_intra = 1; for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; ++fi_mode) { int64_t this_rd_tmp; mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; - super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd); if (rd_stats_y_fi.rate == INT_MAX) { continue; @@ -10223,6 +10928,9 @@ static int handle_intra_mode(InterModeSearchState *search_state, intra_mode_cost[mbmi->mode]); this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) { + break; + } if (this_rd_tmp < best_rd_tmp) { best_tx_size = mbmi->tx_size; memcpy(best_txk_type, mbmi->txk_type, @@ -10249,12 +10957,23 @@ static int handle_intra_mode(InterModeSearchState *search_state, mbmi->filter_intra_mode_info.use_filter_intra = 0; } } - - if (rd_stats_y->rate == INT_MAX) return 0; - + if (rd_stats_y->rate == INT_MAX) return INT64_MAX; + const int mode_cost_y = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_uv); if (num_planes > 1) { uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); if (search_state->rate_uv_intra[uv_tx] == INT_MAX) { + int rate_y = + rd_stats_y->skip ? 
x->skip_cost[skip_ctx][1] : rd_stats_y->rate; + const int64_t rdy = + RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist); + if (search_state->best_rd < (INT64_MAX / 2) && + rdy > (search_state->best_rd + (search_state->best_rd >> 2))) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } choose_intra_uv_mode( cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx], &search_state->rate_uv_tokenonly[uv_tx], @@ -10262,6 +10981,14 @@ static int handle_intra_mode(InterModeSearchState *search_state, &search_state->mode_uv[uv_tx]); if (try_palette) search_state->pmi_uv[uv_tx] = *pmi; search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = search_state->rate_uv_tokenonly[uv_tx]; + const int64_t uv_dist = search_state->dist_uvs[uv_tx]; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > search_state->best_rd) { + search_state->skip_intra_modes = 1; + return INT64_MAX; + } } rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx]; @@ -10277,10 +11004,7 @@ static int handle_intra_mode(InterModeSearchState *search_state, } mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx]; } - - rd_stats->rate = - rd_stats_y->rate + - intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]); + rd_stats->rate = rd_stats_y->rate + mode_cost_y; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded @@ -10308,14 +11032,13 @@ static int handle_intra_mode(InterModeSearchState *search_state, rd_stats_y->rate = 0; rd_stats_uv->rate = 0; // Cost the skip mb case - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][1]; + rd_stats->rate += x->skip_cost[skip_ctx][1]; } else { // Add in the cost of the no skip flag. - rd_stats->rate += x->skip_cost[av1_get_skip_context(xd)][0]; + rd_stats->rate += x->skip_cost[skip_ctx][0]; } // Calculate the final RD estimate for this mode. - int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - + const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); // Keep record of best intra rd if (this_rd < search_state->best_intra_rd) { search_state->best_intra_rd = this_rd; @@ -10333,14 +11056,322 @@ static int handle_intra_mode(InterModeSearchState *search_state, search_state->best_pred_rd[i] = AOMMIN(search_state->best_pred_rd[i], this_rd); } - return 1; + return this_rd; } -void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, +static void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 
0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static void analyze_single_states(const AV1_COMP *cpi, + InterModeSearchState *search_state) { + int i, j, dir, mode; + if (cpi->sf.prune_comp_search_by_single_result >= 1) { + for (dir = 0; dir < 2; ++dir) { + int64_t best_rd; + SingleInterModeState(*state)[FWD_REFS]; + + // Use the best rd of GLOBALMV or NEWMV to prune the unlikely + // reference frames for all the modes (NEARESTMV and NEARMV may not + // have the same motion vectors). Always keep the best of each mode + // because it might form the best possible combination with another mode. 
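/*
 * [Editor's note, not part of the patch] The test below keeps a state only
 * while (rd >> 1) <= best_rd, i.e. any reference whose best single-ref cost
 * is more than twice the better of the best NEWMV / best GLOBALMV rd gets
 * marked invalid. For example, with best_rd = 1000, a state with rd = 2100
 * is dropped (2100 >> 1 = 1050 > 1000) while one with rd = 1900 survives
 * (1900 >> 1 = 950 <= 1000).
 */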
+ state = search_state->single_state[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + + state = search_state->single_state_modelled[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; + ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 1) > best_rd) { + state[mode][i].valid = 0; + } + } + } + } + } + + // Ordering by simple rd first, then by modelled rd + for (dir = 0; dir < 2; ++dir) { + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + const int state_cnt_s = search_state->single_state_cnt[dir][mode]; + const int state_cnt_m = + search_state->single_state_modelled_cnt[dir][mode]; + SingleInterModeState *state_s = search_state->single_state[dir][mode]; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode]; + int count = 0; + const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); + for (i = 0; i < state_cnt_s; ++i) { + if (state_s[i].rd == INT64_MAX) break; + if (state_s[i].valid) + search_state->single_rd_order[dir][mode][count++] = + state_s[i].ref_frame; + } + if (count < max_candidates) { + for (i = 0; i < state_cnt_m; ++i) { + if (state_m[i].rd == INT64_MAX) break; + if (state_m[i].valid) { + int ref_frame = state_m[i].ref_frame; + int match = 0; + // Check if it already exists + for (j = 0; j < count; ++j) { + if (search_state->single_rd_order[dir][mode][j] == ref_frame) { + match = 1; + break; + } + } + if (!match) { + // Check if this ref_frame was removed in simple rd + int valid = 1; + for (j = 0; j < state_cnt_s; j++) { + if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) { + valid = 0; + break; + } + } + if (valid) + search_state->single_rd_order[dir][mode][count++] = ref_frame; + } + if (count >= max_candidates) break; + } + } + } + } + } +} + +static int compound_skip_get_candidates( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const int dir, const PREDICTION_MODE mode) { + const int mode_offset = INTER_OFFSET(mode); + const SingleInterModeState *state = + search_state->single_state[dir][mode_offset]; + const SingleInterModeState *state_modelled = + search_state->single_state_modelled[dir][mode_offset]; + int max_candidates = 0; + int candidates; + + for (int i = 0; i < FWD_REFS; ++i) { + if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; + max_candidates++; + } + + candidates = max_candidates; + if (cpi->sf.prune_comp_search_by_single_result >= 2) { + candidates = AOMMIN(2, max_candidates); + } + if (cpi->sf.prune_comp_search_by_single_result >= 3) { + if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && + state[0].ref_frame == state_modelled[0].ref_frame) + candidates = 1; + if (mode == NEARMV || mode == GLOBALMV) candidates = 1; + } + return candidates; +} + +static int compound_skip_by_single_states( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, + const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) { + const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame }; + const 
int mode[2] = { compound_ref0_mode(this_mode), + compound_ref1_mode(this_mode) }; + const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) }; + const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1, + refs[1] <= GOLDEN_FRAME ? 0 : 1 }; + int ref_searched[2] = { 0, 0 }; + int ref_mv_match[2] = { 1, 1 }; + int i, j; + + for (i = 0; i < 2; ++i) { + const SingleInterModeState *state = + search_state->single_state[mode_dir[i]][mode_offset[i]]; + const int state_cnt = + search_state->single_state_cnt[mode_dir[i]][mode_offset[i]]; + for (j = 0; j < state_cnt; ++j) { + if (state[j].ref_frame == refs[i]) { + ref_searched[i] = 1; + break; + } + } + } + + const int ref_set = get_drl_refmv_count(x, refs, this_mode); + for (i = 0; i < 2; ++i) { + if (mode[i] == NEARESTMV || mode[i] == NEARMV) { + const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME }; + int identical = 1; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) { + int_mv single_mv; + int_mv comp_mv; + get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs, + x->mbmi_ext); + get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext); + + identical &= (single_mv.as_int == comp_mv.as_int); + if (!identical) { + ref_mv_match[i] = 0; + break; + } + } + } + } + + for (i = 0; i < 2; ++i) { + if (ref_searched[i] && ref_mv_match[i]) { + const int candidates = + compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]); + const MV_REFERENCE_FRAME *ref_order = + search_state->single_rd_order[mode_dir[i]][mode_offset[i]]; + int match = 0; + for (j = 0; j < candidates; ++j) { + if (refs[i] == ref_order[j]) { + match = 1; + break; + } + } + if (!match) return 1; + } + } + + return 0; +} + +static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode, + InterModeSearchState *search_state) { + const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1]; + if (search_state->num_available_refs > 2) { + if ((ref_frame == search_state->dist_order_refs[0] && + second_ref_frame == search_state->dist_order_refs[1]) || + (ref_frame == search_state->dist_order_refs[1] && + second_ref_frame == search_state->dist_order_refs[0])) + return 1; // drop this pair of refs + } + return 0; +} + +static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state, + const MODE_DEFINITION *mode, + int64_t distortion2) { + const PREDICTION_MODE this_mode = mode->mode; + MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0]; + const int idx = ref_frame - LAST_FRAME; + if (idx && distortion2 > search_state->dist_refs[idx]) { + search_state->dist_refs[idx] = distortion2; + search_state->dist_order_refs[idx] = ref_frame; + } + + // Reached the last single ref prediction mode + if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) { + // bubble sort dist_refs and the order index + for (int i = 0; i < REF_FRAMES; ++i) { + for (int k = i + 1; k < REF_FRAMES; ++k) { + if (search_state->dist_refs[i] < search_state->dist_refs[k]) { + int64_t tmp_dist = search_state->dist_refs[i]; + search_state->dist_refs[i] = search_state->dist_refs[k]; + search_state->dist_refs[k] = tmp_dist; + + int tmp_idx = search_state->dist_order_refs[i]; + search_state->dist_order_refs[i] = search_state->dist_order_refs[k]; + search_state->dist_order_refs[k] = tmp_idx; + } + } + } + for (int i = 0; i < REF_FRAMES; ++i) { + if (search_state->dist_refs[i] == -1) break; + search_state->num_available_refs = i; + } + search_state->num_available_refs++; + } +} + +static void 
alloc_compound_type_rd_buffers(AV1_COMMON *const cm, + CompoundTypeRdBuffers *const bufs) { + CHECK_MEM_ERROR( + cm, bufs->pred0, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); + CHECK_MEM_ERROR( + cm, bufs->pred1, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); + CHECK_MEM_ERROR( + cm, bufs->residual1, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); + CHECK_MEM_ERROR( + cm, bufs->diff10, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); + CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf, + (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * + sizeof(*bufs->tmp_best_mask_buf))); +} + +static void release_compound_type_rd_buffers( + CompoundTypeRdBuffers *const bufs) { + aom_free(bufs->pred0); + aom_free(bufs->pred1); + aom_free(bufs->residual1); + aom_free(bufs->diff10); + aom_free(bufs->tmp_best_mask_buf); + av1_zero(*bufs); // Set all pointers to NULL for safety. +} + +void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - const AV1_COMMON *const cm = &cpi->common; + AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -10350,9 +11381,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const struct segmentation *const seg = &cm->seg; PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; - int i, k; + int i; struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; @@ -10364,28 +11394,57 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, InterModeSearchState search_state; init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, best_rd_so_far); - + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; HandleInterModeArgs args = { { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, NULL, NULL, - NULL, NULL, + NULL, search_state.modelled_rd, { { 0 } }, INT_MAX, - INT_MAX + INT_MAX, search_state.simple_rd, + 0, interintra_modes }; for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; av1_invalid_rd_stats(rd_cost); // init params, set frame modes, speed features - set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, - ref_frame_skip_mask, mode_skip_mask, - ref_costs_single, ref_costs_comp, yv12_mb); + set_params_rd_pick_inter_mode( + cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask, + ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); #if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t best_est_rd = INT64_MAX; + // TODO(angiebird): Turn this on when this speed feature is well tested +#if 1 + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + const int do_tx_search = !md->ready; +#else + const int do_tx_search = 1; +#endif + InterModesInfo *inter_modes_info = &tile_data->inter_modes_info; + inter_modes_info->num = 0; #endif + int intra_mode_num = 0; + int intra_mode_idx_ls[MAX_MODES]; + int 
reach_first_comp_mode = 0; + + // Temporary buffers used by handle_inter_mode(). + // We allocate them once and reuse it in every call to that function. + // Note: Must be allocated on the heap due to large size of the arrays. + uint8_t *tmp_buf_orig; + CHECK_MEM_ERROR( + cm, tmp_buf_orig, + (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE)); + uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig); + + CompoundTypeRdBuffers rd_buffers; + alloc_compound_type_rd_buffers(cm, &rd_buffers); + for (int midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int64_t this_rd = INT64_MAX; @@ -10394,42 +11453,44 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t distortion2 = 0; int skippable = 0; int this_skip2 = 0; - - this_mode = av1_mode_order[mode_index].mode; - ref_frame = av1_mode_order[mode_index].ref_frame[0]; - second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; + const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index]; + const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + this_mode = mode_order->mode; init_mbmi(mbmi, mode_index, cm); x->skip = 0; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index, - mi_row, mi_col, mode_skip_mask, - ref_frame_skip_mask)) - continue; - - if (ref_frame == INTRA_FRAME) { - if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) - continue; + // Reach the first compound prediction mode + if (sf->prune_comp_search_by_single_result > 0 && comp_pred && + reach_first_comp_mode == 0) { + analyze_single_states(cpi, &search_state); + reach_first_comp_mode = 1; } + const int ret = inter_mode_search_order_independent_skip( + cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask, + ref_frame_skip_mask, &search_state); + if (ret == 1) continue; + args.skip_motion_mode = (ret == 2); - if (sf->drop_ref) { - if (ref_frame > INTRA_FRAME && second_ref_frame > INTRA_FRAME) { - if (search_state.num_available_refs > 2) { - if ((ref_frame == search_state.dist_order_refs[0] && - second_ref_frame == search_state.dist_order_refs[1]) || - (ref_frame == search_state.dist_order_refs[1] && - second_ref_frame == search_state.dist_order_refs[0])) - continue; - } + if (sf->drop_ref && comp_pred) { + if (sf_check_is_drop_ref(mode_order, &search_state)) { + continue; } } if (search_state.best_rd < search_state.mode_threshold[mode_index]) continue; - const int comp_pred = second_ref_frame > INTRA_FRAME; + if (sf->prune_comp_search_by_single_result > 0 && comp_pred) { + if (compound_skip_by_single_states(cpi, &search_state, this_mode, + ref_frame, second_ref_frame, x)) + continue; + } + const int ref_frame_cost = comp_pred ? 
@@ -10474,18 +11535,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     }
 
     if (ref_frame == INTRA_FRAME) {
-      RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
-      const int ret = handle_intra_mode(
-          &search_state, cpi, x, bsize, ref_frame_cost, ctx, disable_skip,
-          &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
-      if (!ret) {
-        continue;
-      }
-      rate2 = intra_rd_stats.rate;
-      distortion2 = intra_rd_stats.dist;
-      this_rd = RDCOST(x->rdmult, rate2, distortion2);
-      skippable = intra_rd_stats.skip;
-      rate_y = intra_rd_stats_y.rate;
+      intra_mode_idx_ls[intra_mode_num++] = mode_index;
+      continue;
     } else {
       mbmi->angle_delta[PLANE_TYPE_Y] = 0;
       mbmi->angle_delta[PLANE_TYPE_UV] = 0;
@@ -10501,17 +11552,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       args.single_newmv = search_state.single_newmv;
       args.single_newmv_rate = search_state.single_newmv_rate;
       args.single_newmv_valid = search_state.single_newmv_valid;
-      args.modelled_rd = search_state.modelled_rd;
       args.single_comp_cost = real_compmode_cost;
       args.ref_frame_cost = ref_frame_cost;
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
-      this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
-                                  &rd_stats_uv, &disable_skip, mi_row, mi_col,
-                                  &args, ref_best_rd, &best_est_rd);
+      this_rd = handle_inter_mode(
+          cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+          mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+          &best_est_rd, do_tx_search, inter_modes_info);
 #else
       this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
                                   &rd_stats_uv, &disable_skip, mi_row, mi_col,
-                                  &args, ref_best_rd);
+                                  &args, ref_best_rd, tmp_buf, &rd_buffers);
 #endif
       rate2 = rd_stats.rate;
       skippable = rd_stats.skip;
@@ -10520,6 +11571,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       rate_uv = rd_stats_uv.rate;
     }
 
+    if (sf->prune_comp_search_by_single_result > 0 &&
+        is_inter_singleref_mode(this_mode)) {
+      collect_single_states(x, &search_state, mbmi);
+    }
+
     if (this_rd == INT64_MAX) continue;
 
     this_skip2 = mbmi->skip;
@@ -10554,10 +11610,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       search_state.best_mbmode = *mbmi;
       search_state.best_skip2 = this_skip2;
       search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+      if (do_tx_search) {
+        // When do_tx_search == 0, handle_inter_mode won't provide correct
+        // rate_y and rate_uv because the txfm_search process is replaced by
+        // RD estimation.
+        // Therefore, we should avoid updating best_rate_y and best_rate_uv
+        // here. These two values will be updated when txfm_search is called.
+        search_state.best_rate_y =
+            rate_y +
+            x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+        search_state.best_rate_uv = rate_uv;
+      }
+#else  // CONFIG_COLLECT_INTER_MODE_RD_STATS
       search_state.best_rate_y =
           rate_y +
           x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
       search_state.best_rate_uv = rate_uv;
+#endif  // CONFIG_COLLECT_INTER_MODE_RD_STATS
       memcpy(ctx->blk_skip, x->blk_skip,
              sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
     }
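The guarded best_rate_y/best_rate_uv update above exists because, with do_tx_search == 0, each mode so far carries only an estimated RD cost; the full transform search is deferred and replayed later over a sorted candidate list, as the next hunk shows. A compact sketch of that estimate-first, refine-later scheme, with invented names (only the 0.9 ratio is taken from the code):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative candidate record: estimated RD plus enough to redo the
   prediction later (elided here); it mirrors the role of InterModesInfo. */
typedef struct {
  int64_t est_rd;
  int mode_index;
} Candidate;

static int cmp_est_rd(const void *va, const void *vb) {
  const Candidate *a = (const Candidate *)va;
  const Candidate *b = (const Candidate *)vb;
  return (a->est_rd > b->est_rd) - (a->est_rd < b->est_rd);
}

/* Sort by estimated RD, then run the expensive full evaluation (the
   deferred transform search) only on candidates whose estimate stays
   within roughly 11% of the best, the same cut implied by the test
   "curr_est_rd * 0.9 > top_est_rd". */
static void evaluate_survivors(Candidate *c, int n,
                               void (*full_eval)(const Candidate *)) {
  if (n <= 0) return;
  qsort(c, (size_t)n, sizeof(*c), cmp_est_rd);
  const double top = (double)c[0].est_rd;
  for (int i = 0; i < n; ++i) {
    if ((double)c[i].est_rd * 0.9 > top) continue;  /* prune this candidate */
    full_eval(&c[i]);
  }
}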
@@ -10588,43 +11658,124 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
         search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
+    if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+      // Collect data from single-reference modes and analyze it.
+      sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+    }
 
-    if (sf->drop_ref) {
-      if (second_ref_frame == NONE_FRAME) {
-        const int idx = ref_frame - LAST_FRAME;
-        if (idx && distortion2 > search_state.dist_refs[idx]) {
-          search_state.dist_refs[idx] = distortion2;
-          search_state.dist_order_refs[idx] = ref_frame;
-        }
+    if (x->skip && !comp_pred) break;
+  }
 
-        // Reach the last single ref prediction mode
-        if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
-          // bubble sort dist_refs and the order index
-          for (i = 0; i < REF_FRAMES; ++i) {
-            for (k = i + 1; k < REF_FRAMES; ++k) {
-              if (search_state.dist_refs[i] < search_state.dist_refs[k]) {
-                int64_t tmp_dist = search_state.dist_refs[i];
-                search_state.dist_refs[i] = search_state.dist_refs[k];
-                search_state.dist_refs[k] = tmp_dist;
-
-                int tmp_idx = search_state.dist_order_refs[i];
-                search_state.dist_order_refs[i] =
-                    search_state.dist_order_refs[k];
-                search_state.dist_order_refs[k] = tmp_idx;
-              }
-            }
-          }
+  aom_free(tmp_buf_orig);
+  tmp_buf_orig = NULL;
+  release_compound_type_rd_buffers(&rd_buffers);
 
-          for (i = 0; i < REF_FRAMES; ++i) {
-            if (search_state.dist_refs[i] == -1) break;
-            search_state.num_available_refs = i;
-          }
-          search_state.num_available_refs++;
-        }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+  if (!do_tx_search) {
+    inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+    search_state.best_rd = INT64_MAX;
+
+    int64_t top_est_rd =
+        inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+    for (int j = 0; j < inter_modes_info->num; ++j) {
+      const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+      *mbmi = inter_modes_info->mbmi_arr[data_idx];
+      int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+      if (curr_est_rd * 0.9 > top_est_rd) {
+        continue;
+      }
+      const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+      x->skip = 0;
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+      // Select prediction reference frames.
+      const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+      for (i = 0; i < num_planes; i++) {
+        xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+        if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+      }
+
+      RD_STATS rd_stats;
+      RD_STATS rd_stats_y;
+      RD_STATS rd_stats_uv;
+
+      av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+      if (mbmi->motion_mode == OBMC_CAUSAL)
+        av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+      if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+                       &rd_stats_uv, mode_rate, search_state.best_rd)) {
+        continue;
+      } else {
+        const int skip_ctx = av1_get_skip_context(xd);
+        inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+                             rd_stats.dist,
+                             rd_stats_y.rate + rd_stats_uv.rate +
+                                 x->skip_cost[skip_ctx][mbmi->skip]);
+      }
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+      if (rd_stats.rdcost < search_state.best_rd) {
+        search_state.best_rd = rd_stats.rdcost;
+        // Note the index of the best mode so far.
+        const int mode_index = get_prediction_mode_idx(
+            mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        search_state.best_mode_index = mode_index;
+        *rd_cost = rd_stats;
+        search_state.best_rd = rd_stats.rdcost;
+        search_state.best_mbmode = *mbmi;
+        search_state.best_skip2 = mbmi->skip;
+        search_state.best_mode_skippable = rd_stats.skip;
+        search_state.best_rate_y =
+            rd_stats_y.rate +
+            x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+        search_state.best_rate_uv = rd_stats_uv.rate;
+        memcpy(ctx->blk_skip, x->blk_skip,
+               sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      }
+    }
+  }
+#endif
 
-    if (x->skip && !comp_pred) break;
+  for (int j = 0; j < intra_mode_num; ++j) {
+    const int mode_index = intra_mode_idx_ls[j];
+    const MV_REFERENCE_FRAME ref_frame =
+        av1_mode_order[mode_index].ref_frame[0];
+    assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+    assert(ref_frame == INTRA_FRAME);
+    if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+    init_mbmi(mbmi, mode_index, cm);
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+    // Select prediction reference frames.
+    for (i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+    }
+
+    RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+    const int ref_frame_cost = ref_costs_single[ref_frame];
+    intra_rd_stats.rdcost = handle_intra_mode(
+        &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+        &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+    if (intra_rd_stats.rdcost < search_state.best_rd) {
+      search_state.best_rd = intra_rd_stats.rdcost;
+      // Note the index of the best mode so far.
+      search_state.best_mode_index = mode_index;
+      *rd_cost = intra_rd_stats;
+      search_state.best_rd = intra_rd_stats.rdcost;
+      search_state.best_mbmode = *mbmi;
+      search_state.best_skip2 = 0;
+      search_state.best_mode_skippable = intra_rd_stats.skip;
+      search_state.best_rate_y =
+          intra_rd_stats_y.rate +
+          x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+      search_state.best_rate_uv = intra_rd_stats_uv.rate;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    }
+  }
 
   // In effect only when speed >= 2.
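The loop above completes the intra half of the same deferral: intra candidates remembered during the main loop are evaluated only once the best inter cost is known, so handle_intra_mode() always competes against the tightest possible reference RD and the sf->skip_intra_in_interframe flag can abandon the whole pass early. Its skeleton, with a function pointer standing in for handle_intra_mode(); nothing here is libaom API:

#include <stdint.h>

/* Evaluate deferred intra candidates against the best-so-far cost.
   `eval` is a hypothetical stand-in for handle_intra_mode(). */
static void eval_deferred_intra(const int *idx_ls, int num, int skip_intra,
                                int64_t (*eval)(int mode_index),
                                int64_t *best_rd) {
  for (int j = 0; j < num; ++j) {
    if (skip_intra) break;  /* the sf->skip_intra_in_interframe early-out */
    const int64_t rd = eval(idx_ls[j]);
    if (rd < *best_rd) *best_rd = rd;  /* best-so-far bookkeeping */
  }
}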
@@ -10635,7 +11786,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   // Only try palette mode when the best mode so far is an intra mode.
   if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
-    search_palette_mode(cpi, x, rd_cost, ctx, bsize, mbmi, pmi,
+    search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
                         ref_costs_single, &search_state);
   }
@@ -10776,11 +11927,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
   if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
     int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
-    mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
 
     // Select the samples according to motion vector difference
-    if (mbmi->num_proj_ref[0] > 1)
-      mbmi->num_proj_ref[0] = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
-                                            mbmi->num_proj_ref[0], bsize);
+    if (mbmi->num_proj_ref > 1)
+      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                         mbmi->num_proj_ref, bsize);
   }
 
   set_default_interp_filters(mbmi, cm->interp_filter);
@@ -10853,7 +12004,7 @@ static INLINE void calc_target_weighted_pred_above(
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
@@ -10899,7 +12050,7 @@ static INLINE void calc_target_weighted_pred_left(
   struct calc_target_weighted_pred_ctxt *ctxt =
       (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
 
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
   const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
 
   int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
@@ -10982,8 +12133,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
-  const int bw = xd->n8_w << MI_SIZE_LOG2;
-  const int bh = xd->n8_h << MI_SIZE_LOG2;
+  const int bw = xd->n4_w << MI_SIZE_LOG2;
+  const int bh = xd->n4_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
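The n8_w/n8_h to n4_w/n4_h renames in these final hunks track block dimensions in 4x4 mode-info units, so pixel sizes come from a shift by MI_SIZE_LOG2. A self-contained check of that conversion, assuming the AV1 4x4 mode-info unit (the constant is duplicated here for illustration):

#include <assert.h>

/* With 4x4 mode-info units, MI_SIZE is 4 and MI_SIZE_LOG2 is 2, so a
   width of n4 units spans (n4 << 2) pixels. */
enum { MI_SIZE_LOG2_ILLUSTRATIVE = 2 };

static int mi_units_to_pixels(int n4) {
  return n4 << MI_SIZE_LOG2_ILLUSTRATIVE;
}

int main(void) {
  /* A 64x32 block spans 16x8 units of 4x4; check both pixel sizes. */
  assert(mi_units_to_pixels(16) == 64);
  assert(mi_units_to_pixels(8) == 32);
  return 0;
}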