Diffstat (limited to 'third_party/aom/av1/common/x86')
12 files changed, 1130 insertions, 29 deletions
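Most of the 1130 insertions below are the new 2D convolution kernels (convolve_2d_sse2.c and highbd_convolve_2d_ssse3.c) plus the AVX2 rounding helpers in convolve_avx2.c. As orientation before the SIMD code, here is a minimal scalar sketch of the two-pass pipeline those kernels vectorize in their non-CONFIG_COMPOUND_ROUND variants: a horizontal filter pass with a bit-depth-dependent offset and a round_0 shift into an intermediate buffer, then a vertical pass that cancels the offset, shifts by round_1, and accumulates into the 32-bit destination. The helper below is not part of the patch; the rounding constants mirror those visible in the diff, FILTER_BITS is assumed to be 7 as in aom_dsp/aom_filter.h, and the buffer sizing assumes blocks of at most 128x128 with at most 8 filter taps.

#include <stdint.h>

#define FILTER_BITS 7 /* assumed, as in aom_dsp/aom_filter.h */

/* Hypothetical scalar model of av1_convolve_2d (non-CONFIG_COMPOUND_ROUND path). */
static void convolve_2d_scalar_sketch(const uint8_t *src, int src_stride,
                                      int32_t *dst, int dst_stride, int w,
                                      int h, const int16_t *x_filter,
                                      const int16_t *y_filter, int taps,
                                      int round_0, int round_1) {
  const int bd = 8;
  const int fo_vert = taps / 2 - 1;
  const int fo_horiz = taps / 2 - 1;
  const int im_h = h + taps - 1;
  /* Horizontally filtered rows; assumes w, h <= 128 and taps <= 8. */
  static int32_t im_block[(128 + 7) * 128];

  /* Horizontal pass: filter, add (1 << (bd + FILTER_BITS - 1)), shift by
   * round_0 -- the same constants as the SSE2 round_const/round_shift. */
  for (int i = 0; i < im_h; ++i) {
    for (int j = 0; j < w; ++j) {
      int32_t sum = 1 << (bd + FILTER_BITS - 1);
      for (int k = 0; k < taps; ++k)
        sum += x_filter[k] * src[(i - fo_vert) * src_stride + j - fo_horiz + k];
      im_block[i * w + j] = (sum + ((1 << round_0) >> 1)) >> round_0;
    }
  }

  /* Vertical pass: filter the intermediate rows, subtract the offset
   * introduced above (now scaled by the filter gain and the round_0 shift),
   * shift by round_1, and accumulate into dst, which plays the role of the
   * 32-bit CONV_BUF_TYPE buffer. */
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      int32_t sum = -(1 << (bd + 2 * FILTER_BITS - round_0 - 1));
      for (int k = 0; k < taps; ++k)
        sum += y_filter[k] * im_block[(i + k) * w + j];
      dst[i * dst_stride + j] += (sum + ((1 << round_1) >> 1)) >> round_1;
    }
  }
}

In the real kernels the intermediate rows are kept as packed 16-bit values and eight outputs are produced per iteration with _mm_madd_epi16 on even/odd interleaved pixels, but the per-pixel arithmetic is the same as above.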
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c index 5e627ebcf..e85c15eaf 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c @@ -676,11 +676,12 @@ void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, __m128i verf[6]; __m128i horf[2]; SubpelFilterCoeffs hCoeffs, vCoeffs; + assert(conv_params->do_average == 0 || conv_params->do_average == 1); const uint8_t *src_ptr; - store_pixel_t store2p = store2pixelTab[conv_params->ref]; - store_pixel_t store4p = store4pixelTab[conv_params->ref]; - transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref]; - transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref]; + store_pixel_t store2p = store2pixelTab[conv_params->do_average]; + store_pixel_t store4p = store4pixelTab[conv_params->do_average]; + transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average]; + transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average]; const int tapsNum = filter_params.taps; int block_height, block_residu; @@ -890,10 +891,11 @@ void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, __m128i verf[6]; SubpelFilterCoeffs vCoeffs; const uint8_t *src_ptr; + assert(conv_params->do_average == 0 || conv_params->do_average == 1); uint8_t *dst_ptr = dst; - store_pixel_t store2p = store2pixelTab[conv_params->ref]; - store_pixel_t store4p = store4pixelTab[conv_params->ref]; - store_pixel_t store8p = store8pixelTab[conv_params->ref]; + store_pixel_t store2p = store2pixelTab[conv_params->do_average]; + store_pixel_t store4p = store4pixelTab[conv_params->do_average]; + store_pixel_t store8p = store8pixelTab[conv_params->do_average]; const int tapsNum = filter_params.taps; if (0 == subpel_y_q4 || 16 != y_step_q4) { diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c index 1d7c55349..f7824b627 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -40,7 +40,12 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { - // TODO(sarahparker) must correct for rectangular transforms in follow up + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
+ assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size); const int txfm_size = cfg->row_cfg->txfm_size; const int8_t *shift = cfg->row_cfg->shift; const int8_t *stage_range_col = cfg->col_cfg->stage_range; diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h index af7afb7ee..fd0a6ed2c 100644 --- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h +++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h @@ -64,7 +64,7 @@ static INLINE void transpose_32_4x4(int stride, const __m128i *input, // the entire input block can be represent by a grid of 4x4 blocks // each 4x4 blocks can be represent by 4 vertical __m128i // we first transpose each 4x4 block internally -// than transpose the grid +// then transpose the grid static INLINE void transpose_32(int txfm_size, const __m128i *input, __m128i *output) { const int num_per_128 = 4; diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 000000000..46c2674ca --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +#if CONFIG_COMPOUND_ROUND +void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + DECLARE_ALIGNED(16, uint8_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < 
w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_packus_epi16(res, res); + _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint8_t *data = &im_block[i * im_stride + j]; + const __m128i src_01 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * im_stride))); + const __m128i src_23 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * im_stride))); + const __m128i src_45 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * im_stride))); + const __m128i src_67 = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)), + _mm_loadl_epi64((__m128i *)(data + 7 * im_stride))); + + const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero); + const 
__m128i src_4 = _mm_unpacklo_epi8(src_45, zero); + const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero); + const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero); + const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero); + const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#else +void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = 
_mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i 
src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#endif diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c new file mode 100644 index 000000000..a0e58716d --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_avx2.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "./av1_rtcd.h" + +#if CONFIG_CONVOLVE_ROUND +static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + +// 16 epi16 pixels +static INLINE void pixel_clamp_avx2(__m256i *u, int bd) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(*u, max); + clamped = _mm256_andnot_si256(mask, *u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + + const __m256i zero = _mm256_setzero_si256(); + mask = _mm256_cmpgt_epi16(clamped, zero); + *u = _mm256_and_si256(clamped, mask); +} + +// 8 epi16 pixels +static INLINE void pixel_clamp_sse2(__m128i *u, int bd) { + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(*u, max); + clamped = _mm_andnot_si128(mask, *u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + + const __m128i zero = _mm_setzero_si128(); + mask = _mm_cmpgt_epi16(clamped, zero); + *u = _mm_and_si128(clamped, mask); +} + +// Work on multiple of 32 pixels +static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift, + int num) { + do { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); + __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + x2 = _mm256_add_epi32(x2, *rnd); + x3 = _mm256_add_epi32(x3, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + x2 = _mm256_srai_epi32(x2, shift); + x3 = _mm256_srai_epi32(x3, shift); + + x0 = _mm256_packs_epi32(x0, x1); + x2 = _mm256_packs_epi32(x2, x3); + + pixel_clamp_avx2(&x0, 8); + pixel_clamp_avx2(&x2, 8); + + x0 = _mm256_packus_epi16(x0, x2); + x1 = _mm256_loadu_si256((const __m256i *)sindex); + x2 = _mm256_permutevar8x32_epi32(x0, x1); + + _mm256_storeu_si256((__m256i *)dst, x2); + src += 32; + dst += 32; + num--; + } while (num > 0); +} + +static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + + x0 = _mm256_packs_epi32(x0, x1); + pixel_clamp_avx2(&x0, 8); + + const __m256i x2 = _mm256_packus_epi16(x0, x0); + x1 = _mm256_loadu_si256((const __m256i *)sindex); + x0 = _mm256_permutevar8x32_epi32(x2, x1); + + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0)); +} + +static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst, + const __m256i *rnd, int shift) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + x0 = _mm256_add_epi32(x0, *rnd); + x0 = _mm256_srai_epi32(x0, shift); + + x0 = _mm256_packs_epi32(x0, x0); + pixel_clamp_avx2(&x0, 8); + + x0 = _mm256_packus_epi16(x0, x0); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex); + x0 = _mm256_permutevar8x32_epi32(x0, x1); + + _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0)); +} + +static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst, + const __m128i *rnd, int 
shift) { + __m128i x = _mm_loadu_si128((const __m128i *)src); + x = _mm_add_epi32(x, *rnd); + x = _mm_srai_epi32(x, shift); + + x = _mm_packs_epi32(x, x); + pixel_clamp_sse2(&x, 8); + + x = _mm_packus_epi16(x, x); + *(uint32_t *)dst = _mm_cvtsi128_si32(x); +} + +void av1_convolve_rounding_avx2(const int32_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + int bits) { + const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); + + if (w > 64) { // width = 128 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 32) { // width = 64 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // width = 32 + do { + cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // width = 16 + do { + cal_rounding_16_avx2(src, dst, &rnd_num, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 4) { // width = 8 + do { + cal_rounding_8_avx2(src, dst, &rnd_num, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 2) { // width = 4 + do { + cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else { // width = 2 + do { + dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits)); + dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits)); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } +} + +#if CONFIG_HIGHBITDEPTH +static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src, + uint16_t *dst, + const __m256i *rnd, int shift, + int num, int bd) { + do { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2); + __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + x2 = _mm256_add_epi32(x2, *rnd); + x3 = _mm256_add_epi32(x3, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + x2 = _mm256_srai_epi32(x2, shift); + x3 = _mm256_srai_epi32(x3, shift); + + x0 = _mm256_packs_epi32(x0, x1); + x2 = _mm256_packs_epi32(x2, x3); + + pixel_clamp_avx2(&x0, bd); + pixel_clamp_avx2(&x2, bd); + + x0 = _mm256_permute4x64_epi64(x0, 0xD8); + x2 = _mm256_permute4x64_epi64(x2, 0xD8); + + _mm256_storeu_si256((__m256i *)dst, x0); + _mm256_storeu_si256((__m256i *)(dst + 16), x2); + src += 32; + dst += 32; + num--; + } while (num > 0); +} + +static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src, + uint16_t *dst, + const __m256i *rnd, int shift, + int bd) { + __m256i x0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1); + + x0 = _mm256_add_epi32(x0, *rnd); + x1 = _mm256_add_epi32(x1, *rnd); + + x0 = _mm256_srai_epi32(x0, shift); + x1 = _mm256_srai_epi32(x1, shift); + + x0 = _mm256_packs_epi32(x0, x1); + pixel_clamp_avx2(&x0, bd); + + x0 = _mm256_permute4x64_epi64(x0, 0xD8); + _mm256_storeu_si256((__m256i *)dst, x0); +} + +static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst, + const __m256i *rnd, int shift, + int bd) { + __m256i x = _mm256_loadu_si256((const __m256i *)src); + x 
= _mm256_add_epi32(x, *rnd); + x = _mm256_srai_epi32(x, shift); + + x = _mm256_packs_epi32(x, x); + pixel_clamp_avx2(&x, bd); + + x = _mm256_permute4x64_epi64(x, 0xD8); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x)); +} + +static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst, + const __m128i *rnd, int shift, + int bd) { + __m128i x = _mm_loadu_si128((const __m128i *)src); + x = _mm_add_epi32(x, *rnd); + x = _mm_srai_epi32(x, shift); + + x = _mm_packs_epi32(x, x); + pixel_clamp_sse2(&x, bd); + _mm_storel_epi64((__m128i *)dst, x); +} + +void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride, + uint8_t *dst8, int dst_stride, int w, + int h, int bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1))); + const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num); + + if (w > 64) { // width = 128 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 32) { // width = 64 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // width = 32 + do { + cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // width = 16 + do { + cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 4) { // width = 8 + do { + cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 2) { // width = 4 + do { + cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } else { // width = 2 + do { + dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd); + dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd); + src += src_stride; + dst += dst_stride; + h--; + } while (h > 0); + } +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_CONVOLVE_ROUND diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c new file mode 100644 index 000000000..ff4a0a0fe --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +#if CONFIG_COMPOUND_ROUND +void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, 
subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#else +void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride, + CONV_BUF_TYPE *dst, int dst_stride, int w, + int h, InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 15-bit intermediate array. + assert(conv_params->round_0 >= 5); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_x, subpel_x_q4 & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = 
_mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + *filter_params_y, subpel_y_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round)); + _mm_storeu_si128(p + 1, + _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round)); + } + } + } +} +#endif diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c index 37e2f61e7..35d637f72 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c @@ -19,8 +19,9 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, - int comp_avg, int16_t alpha, int16_t beta, - int16_t gamma, int16_t delta) { + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; #if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; #else diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c index c69614e42..0648b95b3 100644 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -364,8 +364,9 @@ static void iidtx16(__m256i *in) { #endif void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m256i in[16]; + int tx_type = txfm_param->tx_type; load_buffer_16x16(input, in); switch (tx_type) { diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c index d6a598746..bf12a26d3 100644 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -59,10 +59,11 @@ static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) { #endif void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input); in[1] = load_input_data(input + 8); @@ -150,10 +151,11 @@ void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, } void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); + int tx_type = txfm_param->tx_type; // load input data in[0] = load_input_data(input); @@ -251,10 +253,11 @@ static void iidtx16_sse2(__m128i *in0, __m128i *in1) { #endif // CONFIG_EXT_TX void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[32]; __m128i *in0 = &in[0]; __m128i *in1 = &in[16]; + int tx_type = txfm_param->tx_type; load_buffer_8x16(input, in0); input += 8; @@ -388,8 +391,9 @@ static INLINE void flip_buffer_lr_8x8(__m128i *in) { 
#endif // CONFIG_EXT_TX void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -553,8 +557,9 @@ static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in, } void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; // Transpose 16x8 input into in[] in[0] = load_input_data(input + 0 * 16); @@ -713,8 +718,9 @@ static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in, } void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; in[0] = load_input_data(input + 0 * 8); in[1] = load_input_data(input + 1 * 8); @@ -897,8 +903,9 @@ static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in, } void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, - int tx_type) { + const TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; // Load rows, packed two per element of 'in'. // We pack into the bottom half of 'in' so that the @@ -1119,8 +1126,9 @@ static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl, } void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i intl[16], intr[16], inbl[16], inbr[16]; + int tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { @@ -1272,8 +1280,9 @@ static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0, } void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { + int stride, const TxfmParam *txfm_param) { __m128i in0[16], in1[16], in2[16], in3[16]; + int tx_type = txfm_param->tx_type; int i; for (i = 0; i < 16; ++i) { diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index 260faa8c9..e2e4f51c3 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -16,8 +16,8 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n, if (bit_depth > 8) { __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1); __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1); - __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8)); - __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8); + __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a); b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b); a = _mm_mullo_epi32(a, n); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c index cdc4e8d0f..5a22d9abf 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c @@ -17,9 +17,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t 
delta) { + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; __m128i tmp[15]; int i, j, k; const int bd = 8; diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c index 494410e99..f8e6f62ba 100644 --- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c @@ -204,9 +204,10 @@ static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int comp_avg = conv_params->do_average; __m128i tmp[15]; int i, j, k; const int bd = 8; |
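The new convolve kernels above all share one idiom worth calling out: the eight 16-bit filter taps are loaded once and expanded into four registers that each repeat a pair of adjacent taps (the "coeffs 0 1 0 1 0 1 0 1" comments), so that _mm_madd_epi16 can multiply interleaved pixel pairs by (f_k, f_k+1) and add them in a single step. The standalone program below reproduces only that shuffle sequence, with made-up tap values, so the lane layout can be checked in isolation; it is an illustration, not code from the patch.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Eight 16-bit taps f0..f7 (arbitrary values, not a real AV1 filter). */
  const int16_t taps[8] = { -1, 3, -11, 73, 73, -11, 3, -1 };
  const __m128i coeffs = _mm_loadu_si128((const __m128i *)taps);

  /* Same sequence as the convolve kernels:
   * tmp_0 = f0 f1 f0 f1 f2 f3 f2 f3,  tmp_1 = f4 f5 f4 f5 f6 f7 f6 f7 */
  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs, coeffs);
  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs, coeffs);

  /* coeff_01 = f0 f1 repeated four times; likewise for the other pairs. */
  const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
  const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
  const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
  const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
  (void)coeff_23;
  (void)coeff_45;
  (void)coeff_67;

  /* Print coeff_01: expected output is "-1 3 -1 3 -1 3 -1 3". */
  int16_t lanes[8];
  _mm_storeu_si128((__m128i *)lanes, coeff_01);
  for (int k = 0; k < 8; ++k) printf("%d ", lanes[k]);
  printf("\n");
  return 0;
}

In the kernels, each such register is fed to _mm_madd_epi16 together with pixels interleaved as (p_k, p_k+1); that is why the horizontal pass shifts the 8-bit source by 1..7 bytes with _mm_srli_si128 (or uses _mm_alignr_epi8 for the 16-bit high-bit-depth input) before unpacking.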