From bbcc64772580c8a979288791afa02d30bc476d2e Mon Sep 17 00:00:00 2001
From: trav90
Date: Fri, 19 Oct 2018 21:52:15 -0500
Subject: Update aom to v1.0.0

Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
---
 third_party/aom/av1/common/convolve.c | 2126 ++++++++++++++-------------------
 1 file changed, 867 insertions(+), 1259 deletions(-)

diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
index 5476f59a6..d57f44f8b 100644
--- a/third_party/aom/av1/common/convolve.c
+++ b/third_party/aom/av1/common/convolve.c
@@ -12,76 +12,60 @@
 #include <assert.h>
 #include <string.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
 #include "av1/common/blockd.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
-#define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
-#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
-#define MAX_STEP (32)
-
-void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                          int dst_stride, int w, int h,
-                          const InterpFilterParams filter_params,
-                          const int subpel_x_q4, int x_step_q4,
-                          ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = subpel_x_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-          filter_params, x_q4 & SUBPEL_MASK);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
-
-      sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      if (conv_params->do_average)
-        dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
-      else
-        dst[x] = sum;
-
-      x_q4 += x_step_q4;
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const int16_t *x_filters, int x0_qn,
+                             int x_step_qn) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_qn += x_step_qn;
     }
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst,
-                              int dst_stride, int w, int h,
-                              const InterpFilterParams filter_params,
-                              const int subpel_x_qn, int x_step_qn,
-                              ConvolveParams *conv_params) {
-  int x, y;
-  int filter_size = filter_params.taps;
-  assert(conv_params->round == CONVOLVE_OPT_ROUND);
-  src -= filter_size / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx);
-      int k, sum = 0;
-      for (k = 0; k < filter_size; ++k) sum
+= src_x[k] * x_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1); - else - dst[x] = sum; - +void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn, int bd) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_qn += x_step_qn; } src += src_stride; @@ -89,417 +73,358 @@ void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst, } } -void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= src_stride * (filter_size / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = subpel_y_q4; - for (y = 0; y < h; ++y) { - const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params, y_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1); - else - dst[y * dst_stride] = sum; - - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -void av1_convolve_vert_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - int x, y; - int filter_size = filter_params.taps; - assert(conv_params->round == CONVOLVE_OPT_ROUND); - src -= src_stride * (filter_size / 2 - 1); - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y) { - const uint8_t *const src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - - sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - if (conv_params->do_average) - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1); - else - dst[y * dst_stride] = sum; - - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst, +void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { - 
assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (conv_params->do_average == 0) { - int r; - for (r = 0; r < h; ++r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } - } else { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1)); + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } - src += src_stride; - dst += dst_stride; + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } -} -void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (conv_params->do_average == 0) - aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - NULL, -1, w, h); - else - aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - } else { - av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } } } -void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (conv_params->do_average == 0) - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - else - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h); - } else { - 
av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_x_q4, x_step_q4, conv_params); - } -} +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; -void av1_convolve_horiz_facade_scale(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_qn, int x_step_qn, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_x_qn >> SCALE_EXTRA_BITS); - if (conv_params->do_average == 0) - aom_convolve8_horiz_scale(src, src_stride, dst, dst_stride, filter_x, - subpel_x_qn, x_step_qn, NULL, 0, -1, w, h); - else - aom_convolve8_avg_horiz_scale(src, src_stride, dst, dst_stride, filter_x, - subpel_x_qn, x_step_qn, NULL, 0, -1, w, h); - } else { - av1_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_qn, x_step_qn, - conv_params); - } -} + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); -void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); - if (conv_params->do_average == 0) { - aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y, - y_step_q4, w, h); - } else { - aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); } - } else { - av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params, - subpel_y_q4, y_step_q4, conv_params); } } -void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); - if (conv_params->do_average == 0) { - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, filter_y, - y_step_q4, w, h); - } else { - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h); - } - } else { - av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params, - 
subpel_y_q4, y_step_q4, conv_params); - } -} +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_q4; + (void)conv_params; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); -void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - assert(conv_params->round == CONVOLVE_OPT_ROUND); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_y_qn >> SCALE_EXTRA_BITS); - if (conv_params->do_average == 0) { - aom_convolve8_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1, - filter_y, subpel_y_qn, y_step_qn, w, h); - } else { - aom_convolve8_avg_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, - -1, filter_y, subpel_y_qn, y_step_qn, w, h); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); } - } else { - av1_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_qn, y_step_qn, conv_params); } } -#if CONFIG_CONVOLVE_ROUND -void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, int bits) { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[r * dst_stride + c] = - clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], bits)); - } +void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + + for (int y = 0; y < h; ++y) { + memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -#if CONFIG_COMPOUND_ROUND -void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, - int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int x, y, k; - uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; +void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int16_t im_block[(MAX_SB_SIZE 
+ MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) { + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter - uint8_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { dst[y * dst_stride + x] = res; + } } } } -void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params) { - int x, y, k; - uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; - int im_stride = w; +void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - - // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for 
(x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; - im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); - } - src_horiz += src_stride; - } + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_x; + (void)subpel_x_q4; // vertical filter - const uint8_t *src_vert = im_block + fo_vert * im_stride; - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint8_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { dst[y * dst_stride + x] = res; + } } - src_vert++; } } -#else - -/* When convolve-round is enabled and compound-round is disabled, we use a - high-precision convolve filter. - Note: For notes on hardware implementations, including the required - bit widths for various intermediate values, see the comments above - av1_warp_affine_c. 
-*/ -void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, - int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int x, y, k; - int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; +void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_q4; // horizontal filter - const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst[y * dst_stride + x] = res; } - assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); - im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } +} - // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - *filter_params_y, subpel_y_q4 & SUBPEL_MASK); +void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; - } - assert(0 <= sum && sum < (1 << (offset_bits + 2))); - 
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { dst[y * dst_stride + x] = res; + } } } } -void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, + int dst8_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { - int x, y, k; - int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -507,245 +432,255 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { + for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> 
SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } } src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND + +static void convolve_2d_scale_wrapper( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); + } + av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, + y_step_qn, conv_params); +} void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, - int scaled, ConvolveParams *conv_params) { + int scaled, ConvolveParams *conv_params, + const struct scale_factors *sf) { (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x, - &filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * - (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; - int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; - CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; - int tr_dst_stride = MAX_SB_SIZE; - int fo_vert = filter_params_y.taps / 2 - 1; - int fo_horiz = filter_params_x.taps / 2 - 1; - - transpose_uint8(tr_src, tr_src_stride, - src - fo_vert * src_stride - fo_horiz, src_stride, - w + filter_params_x.taps - 1, h + filter_params_y.taps - 1); - transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, - conv_params->dst_stride, w, h); - - // horizontal and vertical 
parameters are swapped because of the transpose - if (scaled) - av1_convolve_2d_scale(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params); - else - av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - subpel_x_q4, conv_params); - transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, - tr_dst_stride, h, w); - } else { - if (scaled) - av1_convolve_2d_scale(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params); - else - av1_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); - } + av1_get_convolve_filter_params(interp_filters, &filter_params_x, + &filter_params_y, w, h); + + if (scaled) + convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, + &filter_params_x, &filter_params_y, subpel_x_q4, + x_step_q4, subpel_y_q4, y_step_q4, conv_params); + else + sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, &filter_params_x, + &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); } -#if CONFIG_HIGHBITDEPTH -void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride, - uint8_t *dst8, int dst_stride, int w, int h, - int bits, int bd) { - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[r * dst_stride + c] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(src[r * src_stride + c], bits), bd); - } +void av1_highbd_convolve_2d_copy_sr_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + (void)conv_params; + (void)bd; + + for (int y = 0; y < h; ++y) { + memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -#if CONFIG_COMPOUND_ROUND -void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int x, y, k; - uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; - int im_h = h + filter_params_y->taps - 1; - int im_stride = w; - const int fo_vert = filter_params_y->taps / 2 - 1; +void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_q4; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); // horizontal filter - const uint16_t *src_horiz = src - fo_vert * src_stride; const 
int16_t *x_filter = av1_get_interp_filter_subpel_kernel( *filter_params_x, subpel_x_q4 & SUBPEL_MASK); - for (y = 0; y < im_h; ++y) { - for (x = 0; x < w; ++x) { - int32_t sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) { - sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } - im_block[y * im_stride + x] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd); + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } } +} +void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); } } } -void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params, int bd) { - int x, y, k; - uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; +void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - (void)bd; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { - 
const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); - int sum = 0; - for (k = 0; k < filter_params_x->taps; ++k) - sum += x_filter[k] * src_x[k - fo_horiz]; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0)); + ROUND_POWER_OF_TWO(sum, conv_params->round_0); } - src_horiz += src_stride; } // vertical filter - uint16_t *src_vert = im_block + fo_vert * im_stride; - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const uint16_t *const src_y = - &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 0; - for (k = 0; k < filter_params_y->taps; ++k) { - sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } - src_vert++; } } -#else - -void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { int x, y, k; - int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; + const int round_bits = + 2 * 
FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; @@ -760,439 +695,367 @@ void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride, assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); (void)bd; im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( *filter_params_y, subpel_y_q4 & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { - CONV_BUF_TYPE sum = 1 << offset_bits; + int32_t sum = 1 << offset_bits; for (k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_y; + (void)subpel_y_q4; + assert(bits >= 0); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + x] = + 
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, + uint16_t *dst16, int dst16_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_x; + (void)subpel_x_q4; + assert(bits >= 0); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { dst[y * dst_stride + x] = res; + } + } + } +} + +void av1_highbd_jnt_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, + int w, int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + assert(bits >= 0); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_q4; + (void)subpel_y_q4; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + if (conv_params->do_average) { + int32_t tmp = dst[y * dst_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst16[y * dst16_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst[y * dst_stride + x] = res; + } } } } void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, - CONV_BUF_TYPE *dst, int dst_stride, int w, - int h, InterpFilterParams *filter_params_x, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int 
subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { - int x, y, k; - int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; - + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; - for (y = 0; y < im_h; ++y) { + for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x, x_qn += x_step_qn) { + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < filter_params_x->taps; ++k) { + for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); im_block[y * im_stride + x] = - ROUND_POWER_OF_TWO(sum, conv_params->round_0); + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter - int32_t *src_vert = im_block + fo_vert * im_stride; + int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y, y_qn += y_step_qn) { - const int32_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx); - CONV_BUF_TYPE sum = 1 << offset_bits; - for (k = 0; k < filter_params_y->taps; ++k) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(0 <= sum && sum < (1 << (offset_bits + 2))); - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - if (conv_params->do_average) - dst[y * dst_stride + x] += res; - else - dst[y * dst_stride + x] = res; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_jnt_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride 
+ x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } } src_vert++; } } -#endif // CONFIG_COMPOUND_ROUND void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, + uint8_t *dst8, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - int bd) { + const struct scale_factors *sf, int bd) { (void)x_step_q4; (void)y_step_q4; - (void)dst; (void)dst_stride; - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x, - &filter_params_y); - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - if (filter_params_y.taps < filter_params_x.taps) { - uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * - (MAX_SB_SIZE + MAX_FILTER_TAP - 1)]; - int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1; - CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE]; - int tr_dst_stride = MAX_SB_SIZE; - int fo_vert = filter_params_y.taps / 2 - 1; - int fo_horiz = filter_params_x.taps / 2 - 1; - - transpose_uint16( - tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz, - src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1); - transpose_int32(tr_dst, tr_dst_stride, conv_params->dst, - conv_params->dst_stride, w, h); - - // horizontal and vertical parameters are swapped because of the transpose - if (scaled) - av1_highbd_convolve_2d_scale( - tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride, tr_dst, - tr_dst_stride, h, w, &filter_params_y, &filter_params_x, subpel_y_q4, - y_step_q4, subpel_x_q4, x_step_q4, conv_params, bd); - else - av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, - tr_src_stride, tr_dst, tr_dst_stride, h, w, - &filter_params_y, &filter_params_x, subpel_y_q4, - subpel_x_q4, conv_params, bd); - transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst, - tr_dst_stride, h, w); - } else { - if (scaled) - av1_highbd_convolve_2d_scale( - src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, - &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4, - subpel_y_q4, y_step_q4, conv_params, bd); - else - av1_highbd_convolve_2d(src, src_stride, conv_params->dst, - conv_params->dst_stride, w, h, &filter_params_x, - &filter_params_y, subpel_x_q4, subpel_y_q4, - conv_params, bd); - } -} -#endif // CONFIG_HIGHBITDEPTH - -#endif // CONFIG_CONVOLVE_ROUND - -typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_q4, int step_q4, - ConvolveParams *conv_params); - -static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, - const int subpel_y_q4, int y_step_q4, - ConvolveParams *conv_params, - ConvolveFunc convolve_horiz, - ConvolveFunc convolve_vert) { - int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0; - int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0; - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - 
assert(conv_params->round == CONVOLVE_OPT_ROUND); - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_q4 <= MAX_STEP); - assert(x_step_q4 <= MAX_STEP); - - if (ignore_horiz && ignore_vert) { - convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params); - } else if (ignore_vert) { - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x, - subpel_x_q4, x_step_q4, conv_params); - } else if (ignore_horiz) { - assert(filter_params_y.taps <= MAX_FILTER_TAP); - convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y, - subpel_y_q4, y_step_q4, conv_params); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - // we do filter with fewer taps first to reduce hardware implementation - // complexity - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_q4, - y_step_q4, &temp_conv_params); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride, - w, h, filter_params_x, subpel_x_q4, x_step_q4, - conv_params); - } else -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp, - temp_stride, w, intermediate_height, filter_params_x, - subpel_x_q4, x_step_q4, &temp_conv_params); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, - dst, dst_stride, w, h, filter_params_y, subpel_y_q4, - y_step_q4, conv_params); - } - } -} + av1_get_convolve_filter_params(interp_filters, &filter_params_x, + &filter_params_y, w, h); -static void convolve_scale_helper(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilters interp_filters, - const int subpel_x_qn, int x_step_qn, - const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params, - ConvolveFunc convolve_horiz, - ConvolveFunc convolve_vert) { - int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0; - int 
ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0; - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - assert(conv_params->round == CONVOLVE_OPT_ROUND); - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - - if (ignore_horiz && ignore_vert) { - convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params); - } else if (ignore_vert) { - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x, - subpel_x_qn, x_step_qn, conv_params); - } else if (ignore_horiz) { - assert(filter_params_y.taps <= MAX_FILTER_TAP); - convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y, - subpel_y_qn, y_step_qn, conv_params); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - // we do filter with fewer taps first to reduce hardware implementation - // complexity - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_qn, - y_step_qn, &temp_conv_params); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride, - w, h, filter_params_x, subpel_x_qn, x_step_qn, - conv_params); - } else { -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - ConvolveParams temp_conv_params; - temp_conv_params.ref = 0; - temp_conv_params.do_average = 0; - temp_conv_params.round = CONVOLVE_OPT_ROUND; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp, - temp_stride, w, intermediate_height, filter_params_x, - subpel_x_qn, x_step_qn, &temp_conv_params); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, - dst, dst_stride, w, h, filter_params_y, subpel_y_qn, - y_step_qn, conv_params); -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER + if (scaled) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); } -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } -} + 
av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, + &filter_params_x, &filter_params_y, + subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, + conv_params, bd); + } else { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); -void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, - int y_step_q4, ConvolveParams *conv_params) { - convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params, - av1_convolve_horiz_facade, av1_convolve_vert_facade); + sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != + 0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, &filter_params_x, + &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); + } } -void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, InterpFilters interp_filters, - const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, - int y_step_q4, ConvolveParams *conv_params) { - convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params, - av1_convolve_horiz_facade_c, av1_convolve_vert_facade_c); +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). +// --Largest block size is 128x128 pixels. +// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the +// original frame (in 1/16th pixel units). +// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((128 - 1) * 32 + 15) >> 4 + 8 = 263. +#define WIENER_MAX_EXT_SIZE 263 + +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; } -void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_qn, - int x_step_qn, const int subpel_y_qn, int y_step_qn, - ConvolveParams *conv_params) { - convolve_scale_helper(src, src_stride, dst, dst_stride, w, h, interp_filters, - subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, - conv_params, av1_convolve_horiz_facade_scale, - av1_convolve_vert_facade_scale); +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; } -void av1_lowbd_convolve_init_c(void) { - // A placeholder for SIMD initialization - return; +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; } -void av1_highbd_convolve_init_c(void) { - // A placeholder for SIMD initialization - return; +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); } -void av1_convolve_init(AV1_COMMON *cm) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - av1_highbd_convolve_init(); - else - av1_lowbd_convolve_init(); -#else - (void)cm; - av1_lowbd_convolve_init(); -#endif - return; +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); } -#if CONFIG_HIGHBITDEPTH -void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, int avg, - int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = subpel_x_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params, x_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - if (avg) - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - else - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); x_q4 += x_step_q4; } src += src_stride; @@ -1200,66 +1063,25 @@ void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride, } } -void av1_highbd_convolve_horiz_scale(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_qn, int x_step_qn, - int avg, int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= filter_size / 2 - 1; - for (y = 0; y < h; ++y) { - int x_qn = subpel_x_qn; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *x_filter = - av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k]; - if (avg) - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - else - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - x_qn += x_step_qn; - } - src += src_stride; - dst += dst_stride; - } -} - -void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, int avg, - int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= src_stride * (filter_size / 2 - 1); - - for (x = 
0; x < w; ++x) { - int y_q4 = subpel_y_q4; - for (y = 0; y < h; ++y) { - const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params, y_q4 & SUBPEL_MASK); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - if (avg) { - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - } else { - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - } +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); y_q4 += y_step_q4; } ++src; @@ -1267,325 +1089,111 @@ void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride, } } -void av1_highbd_convolve_vert_scale(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams filter_params, - const int subpel_y_qn, int y_step_qn, - int avg, int bd) { - int x, y; - int filter_size = filter_params.taps; - src -= src_stride * (filter_size / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_qn = subpel_y_qn; - for (y = 0; y < h; ++y) { - const uint16_t *const src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(y_filter_idx < SUBPEL_SHIFTS); - const int16_t *y_filter = - av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); - int k, sum = 0; - for (k = 0; k < filter_size; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - if (avg) { - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - } else { - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - } - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void highbd_convolve_copy(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - int avg, int bd) { - if (avg == 0) { - int r; - for (r = 0; r < h; ++r) { - memcpy(dst, src, w * sizeof(*src)); - src += src_stride; - dst += dst_stride; - } - } else { - int r, c; - for (r = 0; r < h; ++r) { - for (c = 0; c < w; ++c) { - dst[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[c] + src[c], 1), bd); - } - src += src_stride; - dst += dst_stride; - } - } +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + 
const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); } -void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_x_q4, int x_step_q4, - int avg, int bd) { +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_x = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4); - if (avg == 0) - aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x, - x_step_q4, NULL, -1, w, h, bd); - else - aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, NULL, -1, w, h, bd); - } else { - av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_q4, x_step_q4, avg, bd); - } -} - -void av1_highbd_convolve_horiz_facade_scale( - const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, - int h, const InterpFilterParams filter_params, const int subpel_x_qn, - int x_step_qn, int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS - // as in the function above. 
- av1_highbd_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_x_qn, x_step_qn, avg, - bd); -} - -void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, - int h, - const InterpFilterParams filter_params, - const int subpel_y_q4, int y_step_q4, - int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - if (filter_params.taps == SUBPEL_TAPS) { - const int16_t *filter_y = - av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4); - if (avg == 0) { - aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1, - filter_y, y_step_q4, w, h, bd); - } else { - aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL, - -1, filter_y, y_step_q4, w, h, bd); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 1); + x_q4 += x_step_q4; } - } else { - av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_q4, y_step_q4, avg, bd); + src += src_stride; + dst += dst_stride; } } -void av1_highbd_convolve_vert_facade_scale( - const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, - int h, const InterpFilterParams filter_params, const int subpel_y_qn, - int y_step_qn, int avg, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS - // as in the function above. 
- av1_highbd_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h, - filter_params, subpel_y_qn, y_step_qn, avg, - bd); -} - -void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, - int dst_stride, int w, int h, - InterpFilters interp_filters, const int subpel_x_q4, - int x_step_q4, const int subpel_y_q4, int y_step_q4, - int ref_idx, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0; - int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0; - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_q4 <= MAX_STEP); - assert(x_step_q4 <= MAX_STEP); - - if (ignore_horiz && ignore_vert) { - highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd); - return; - } - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (ignore_vert) { - av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_q4, x_step_q4, - ref_idx, bd); - } else if (ignore_horiz) { - av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h, - filter_params_y, subpel_y_q4, y_step_q4, - ref_idx, bd); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade(src8 - (filter_size / 2 - 1), src_stride, - temp8, temp_stride, intermediate_width, h, - filter_params_y, subpel_y_q4, y_step_q4, - 0, bd); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_horiz_facade( - temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_q4, x_step_q4, ref_idx, bd); - } else -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - { - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - filter_size = filter_params_y.taps; - - intermediate_height = - (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - av1_highbd_convolve_horiz_facade( - src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8, - temp_stride, w, intermediate_height, filter_params_x, subpel_x_q4, - x_step_q4, 0, bd); - - filter_size = filter_params_y.taps; - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade( - temp8 + temp_stride * (filter_size / 2 - 
1), temp_stride, dst8, - dst_stride, w, h, filter_params_y, subpel_y_q4, y_step_q4, ref_idx, - bd); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; } + ++src; + ++dst; } } -void av1_highbd_convolve_scale(const uint8_t *src8, int src_stride, - uint8_t *dst8, int dst_stride, int w, int h, - InterpFilters interp_filters, - const int subpel_x_qn, int x_step_qn, - const int subpel_y_qn, int y_step_qn, - int ref_idx, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - int ignore_horiz = x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0; - int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0; - - assert(w <= MAX_BLOCK_WIDTH); - assert(h <= MAX_BLOCK_HEIGHT); - assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS)); - - if (ignore_horiz && ignore_vert) { - highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd); - return; - } - - InterpFilterParams filter_params_x, filter_params_y; - av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x, - &filter_params_y); - - if (ignore_vert) { - av1_highbd_convolve_horiz_facade_scale(src8, src_stride, dst8, dst_stride, - w, h, filter_params_x, subpel_x_qn, - x_step_qn, ref_idx, bd); - } else if (ignore_horiz) { - av1_highbd_convolve_vert_facade_scale(src8, src_stride, dst8, dst_stride, w, - h, filter_params_y, subpel_y_qn, - y_step_qn, ref_idx, bd); - } else { - // temp's size is set to a 256 aligned value to facilitate SIMD - // implementation. 
The value is greater than (maximum possible intermediate - // height or width) * MAX_SB_SIZE - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp); - int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16); - int filter_size; - -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y); - - if (filter_params_y.taps < filter_params_x.taps) { - int intermediate_width; - int temp_stride = max_intermediate_size; - filter_size = filter_params_x.taps; - intermediate_width = - (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_width <= max_intermediate_size); - - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade_scale( - src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride, - intermediate_width, h, filter_params_y, subpel_y_qn, y_step_qn, 0, - bd); - - assert(filter_params_x.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_horiz_facade_scale( - temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h, - filter_params_x, subpel_x_qn, x_step_qn, ref_idx, bd); - } else { -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - int intermediate_height; - int temp_stride = MAX_SB_SIZE; - filter_size = filter_params_y.taps; - intermediate_height = - (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_size; - assert(intermediate_height <= max_intermediate_size); - (void)max_intermediate_size; - - av1_highbd_convolve_horiz_facade_scale( - src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8, - temp_stride, w, intermediate_height, filter_params_x, subpel_x_qn, - x_step_qn, 0, bd); - - filter_size = filter_params_y.taps; - assert(filter_params_y.taps <= MAX_FILTER_TAP); - - av1_highbd_convolve_vert_facade_scale( - temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8, - dst_stride, w, h, filter_params_y, subpel_y_qn, y_step_qn, ref_idx, - bd); -#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } -#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER - } +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); } -#endif // CONFIG_HIGHBITDEPTH -- cgit v1.2.3
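
The compound branch added to av1_highbd_convolve_2d_scale_c above either stores the rounded vertical result into the CONV_BUF_TYPE buffer or blends it with the value already buffered for the other reference, using plain averaging or the jnt_comp distance weights, before removing the round offset and clipping. The following is only a minimal sketch of that finishing step, assuming DIST_PRECISION_BITS is 4 and that fwd_offset/bck_offset are weights intended to sum to 1 << DIST_PRECISION_BITS; neither value is fixed by this patch itself.

#include <stdint.h>

/* Sketch of the compound finishing step: blend the new rounded result `res`
 * with the buffered value, subtract the rounding offsets accumulated in the
 * horizontal and vertical passes, apply the final shift and clip to the bit
 * depth. DIST_PRECISION_BITS is assumed to be 4. */
static uint16_t compound_finish(int32_t buffered, int32_t res,
                                int use_jnt_comp_avg, int fwd_offset,
                                int bck_offset, int offset_bits, int round_1,
                                int bits, int bd) {
  int32_t tmp;
  if (use_jnt_comp_avg) {
    /* Distance-weighted average; the two weights are meant to sum to 1 << 4. */
    tmp = (buffered * fwd_offset + res * bck_offset) >> 4;
  } else {
    tmp = (buffered + res) >> 1;
  }
  /* Subtract the round offset carried through both filter passes. */
  tmp -= (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  /* ROUND_POWER_OF_TWO(tmp, bits), written out so bits == 0 stays safe. */
  tmp = (tmp + ((1 << bits) >> 1)) >> bits;
  if (tmp < 0) tmp = 0;
  if (tmp > (1 << bd) - 1) tmp = (1 << bd) - 1;
  return (uint16_t)tmp;
}

In the patch the same arithmetic is written inline; splitting it out here only makes the order of blend, offset removal, rounding and clipping easier to follow.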
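
The comment above WIENER_MAX_EXT_SIZE derives the worst-case height of the intermediate buffer used by the two-pass Wiener convolution. That bound can be checked directly against the expression used for intermediate_height; the sketch below assumes the usual aom constants SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 and the 128-pixel maximum block size named in the comment.

#include <assert.h>

/* Worst-case number of rows in the Wiener temp buffer, following the
 * derivation in the comment above WIENER_MAX_EXT_SIZE. SUBPEL_BITS == 4 and
 * SUBPEL_TAPS == 8 are assumed. */
static int wiener_intermediate_height(int h, int y_step_q4, int y0_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> 4 /* SUBPEL_BITS */) +
         8 /* SUBPEL_TAPS */;
}

int main(void) {
  /* Largest block (128 rows), largest step allowed by the asserts (32) and
   * the largest sub-pixel phase (15): ((127 * 32 + 15) >> 4) + 8 == 262. */
  assert(wiener_intermediate_height(128, 32, 15) == 262);
  assert(wiener_intermediate_height(128, 32, 15) <= 263); /* WIENER_MAX_EXT_SIZE */
  return 0;
}

The asserts the patch places in av1_wiener_convolve_add_src_c (w and h no larger than MAX_SB_SIZE, both steps no larger than 32) are what keep the real intermediate_height inside this bound.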
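
get_filter_base() and get_filter_offset() recover the kernel table and sub-pixel phase from a bare filter pointer, relying on the 256-byte alignment noted in the code: with SUBPEL_SHIFTS == 16 phases of SUBPEL_TAPS == 8 int16_t taps, a full table occupies exactly 256 bytes. Below is a small self-contained illustration of that pointer arithmetic; the table is a hypothetical stand-in, not one of the library's filter tables, and the explicit _Alignas(256) is an assumption standing in for the alignment the real tables are expected to have.

#include <assert.h>
#include <stdint.h>

/* A 16-entry table of 8-tap int16_t kernels is 16 * 8 * 2 == 256 bytes, so if
 * it is 256-byte aligned, clearing the low 8 bits of any row pointer yields
 * the table base, and the pointer difference yields the row index. */
typedef int16_t Kernel8[8];

static const Kernel8 *table_base(const int16_t *row) {
  return (const Kernel8 *)((intptr_t)row & ~(intptr_t)0xFF);
}

int main(void) {
  static _Alignas(256) const Kernel8 filters[16] = { { 0 } };
  const int16_t *phase5 = filters[5];  /* some row handed around as int16_t* */
  assert(table_base(phase5) == filters);
  assert((int)((const Kernel8 *)(intptr_t)phase5 - table_base(phase5)) == 5);
  return 0;
}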
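
The convolve_add_src_*_hip helpers add the source pixel under the center tap back in, scaled by 1 << FILTER_BITS, and carry a positive 1 << (bd + FILTER_BITS - 1) bias through the horizontal pass so the uint16_t intermediate never goes negative; the vertical pass includes a matching negative term before clipping back to pixels. The sketch below reproduces a single horizontal output sample under the 8-bit, FILTER_BITS == 7, 8-tap assumptions, with the clamp limit passed in rather than taken from WIENER_CLAMP_LIMIT.

#include <stdint.h>

/* One output sample of the horizontal "add source" pass, modeled on
 * convolve_add_src_horiz_hip(): apply the 8-tap filter, add the center source
 * pixel scaled by 1 << FILTER_BITS plus the positive bias, round by
 * round0_bits and clamp into the intermediate range. FILTER_BITS == 7 and
 * SUBPEL_TAPS == 8 are assumed; clamp_limit stands in for
 * WIENER_CLAMP_LIMIT(round0_bits, bd). */
static uint16_t wiener_horiz_sample(const uint8_t *src_x, const int16_t *filter,
                                    int round0_bits, int bd, int clamp_limit) {
  int sum = ((int)src_x[3] << 7) + (1 << (bd + 7 - 1)); /* center tap + bias */
  for (int k = 0; k < 8; ++k) sum += src_x[k] * filter[k];
  int val = (sum + ((1 << round0_bits) >> 1)) >> round0_bits; /* round */
  if (val < 0) val = 0;
  if (val > clamp_limit - 1) val = clamp_limit - 1;
  return (uint16_t)val;
}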