diff options
author | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:00 -0500 |
---|---|---|
committer | trav90 <travawine@palemoon.org> | 2018-10-19 23:05:03 -0500 |
commit | d2499ead93dc4298c0882fe98902acb1b5209f99 (patch) | |
tree | cb0b942aed59e5108f9a3e9d64e7b77854383421 /third_party/aom/aom_dsp | |
parent | 41fbdea457bf50c0a43e1c27c5cbf7f0a3a9eb33 (diff) | |
download | UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.gz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.lz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.tar.xz UXP-d2499ead93dc4298c0882fe98902acb1b5209f99.zip |
Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591
Diffstat (limited to 'third_party/aom/aom_dsp')
126 files changed, 5796 insertions, 3455 deletions
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c index bba37e227..4791826da 100644 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -16,7 +16,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h deleted file mode 100644 index 6f5b888e4..000000000 --- a/third_party/aom/aom_dsp/aom_convolve.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_DSP_AOM_CONVOLVE_H_ -#define AOM_DSP_AOM_CONVOLVE_H_ - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// Note: Fixed size intermediate buffers, place limits on parameters -// of some functions. 2d filtering proceeds in 2 steps: -// (1) Interpolate horizontally into an intermediate buffer, temp. -// (2) Interpolate temp vertically to derive the sub-pixel result. -// Deriving the maximum number of rows in the temp buffer (135): -// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). -// --Largest block size is 64x64 pixels. -// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the -// original frame (in 1/16th pixel units). -// --Must round-up because block may be located at sub-pixel position. -// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. -// --((64 - 1) * 32 + 15) >> 4 + 8 = 135. -// TODO(wtc): Update the above comment to explain the value 263 used in aom. -#define MAX_EXT_SIZE 263 - -#define EXTRAPREC_BITS 2 -#define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS)) - -typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_AOM_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 7c0111a69..11ff73756 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -15,11 +15,14 @@ set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/aom_convolve.c" - "${AOM_ROOT}/aom_dsp/aom_convolve.h" "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" "${AOM_ROOT}/aom_dsp/aom_filter.h" "${AOM_ROOT}/aom_dsp/aom_simd.h" "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/blend.h" "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" @@ -64,7 +67,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h") + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h") list(APPEND AOM_DSP_COMMON_ASM_SSSE3 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" @@ -76,6 +80,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") @@ -88,7 +93,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" @@ -125,12 +131,9 @@ if(CONFIG_AV1_DECODER) "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" "${AOM_ROOT}/aom_dsp/bitreader.h" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" "${AOM_ROOT}/aom_dsp/daalaboolreader.c" "${AOM_ROOT}/aom_dsp/daalaboolreader.h" - "${AOM_ROOT}/aom_dsp/entdec.c" - "${AOM_ROOT}/aom_dsp/entdec.h" + "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h" "${AOM_ROOT}/aom_dsp/grain_synthesis.c" "${AOM_ROOT}/aom_dsp/grain_synthesis.h") endif() @@ -140,8 +143,6 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" "${AOM_ROOT}/aom_dsp/bitwriter.h" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" "${AOM_ROOT}/aom_dsp/entenc.c" @@ -158,13 +159,13 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/quantize.c" "${AOM_ROOT}/aom_dsp/quantize.h" "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sse.c" "${AOM_ROOT}/aom_dsp/sad_av1.c" "${AOM_ROOT}/aom_dsp/sum_squares.c" "${AOM_ROOT}/aom_dsp/variance.c" "${AOM_ROOT}/aom_dsp/variance.h") list(APPEND AOM_DSP_ENCODER_ASM_SSE2 - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" @@ -178,11 +179,11 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") @@ -199,8 +200,12 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -219,6 +224,7 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index c5dc9a834..a185b23c8 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_DSP_COMMON_H_ -#define AOM_DSP_AOM_DSP_COMMON_H_ +#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ #include "config/aom_config.h" @@ -95,4 +95,4 @@ static INLINE unsigned int negative_to_zero(int value) { } // extern "C" #endif -#endif // AOM_DSP_AOM_DSP_COMMON_H_ +#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 1a9ac3184..8e8a480fe 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -519,23 +519,23 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ # Quantization # if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b sse2 avx2/; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_32x32 sse2/; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } # CONFIG_AV1_ENCODER @@ -543,12 +543,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Alpha blending with mask # add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params"; -specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd"; add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; -specialize "aom_blend_a64_mask", qw/sse4_1/; +specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; @@ -569,15 +569,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/aom_highbd_subtract_block sse2/; + add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; + specialize qw/aom_sse sse4_1 avx2/; + + add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/aom_highbd_sse sse4_1 avx2/; + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Sum of Squares # add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; - specialize qw/aom_sum_squares_2d_i16 sse2/; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2/; add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; specialize qw/aom_sum_squares_i16 sse2/; + } @@ -830,7 +837,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_highbd_sad16x64x4d sse2/; specialize qw/aom_highbd_sad64x16x4d sse2/; - # # Structured Similarity (SSIM) # @@ -888,36 +894,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, int ref_stride"; + int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search"; specialize qw/aom_upsampled_pred sse2/; add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride"; + int ref_stride, int subpel_search"; specialize qw/aom_comp_avg_upsampled_pred sse2/; add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/; + add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search"; + specialize qw/aom_comp_mask_upsampled_pred sse2/; + add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; specialize qw/aom_highbd_upsampled_pred sse2/; add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param"; + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/; @@ -1101,7 +1114,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/; specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; } @@ -1154,17 +1167,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - # - # Specialty Subpixel - # - add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_h sse2/; - - add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_v sse2/; - - add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_hv sse2/; # # Comp Avg @@ -1174,6 +1176,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; specialize qw/aom_jnt_comp_avg_pred ssse3/; + add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x128 sse2/; add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_12_variance64x64 sse2/; @@ -1209,40 +1219,58 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x64 sse2/; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x32 sse2/; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x64 sse2/; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x32 sse2/; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x16 sse2/; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x32 sse2/; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x16 sse2/; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x8 sse2/; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x16 sse2/; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x8 sse2/; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x128 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_8_variance64x64 sse2/; @@ -1310,9 +1338,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/aom_highbd_12_mse8x8 sse2/; - add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; specialize qw/aom_highbd_jnt_comp_avg_pred sse2/; # @@ -1539,8 +1567,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; specialize qw/aom_comp_mask_pred ssse3 avx2/; - add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - specialize qw/aom_highbd_comp_mask_pred avx2/; + add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + specialize qw/aom_highbd_comp_mask_pred sse2 avx2/; } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h index fd4f51b29..00686ac38 100644 --- a/third_party/aom/aom_dsp/aom_filter.h +++ b/third_party/aom/aom_dsp/aom_filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_FILTER_H_ -#define AOM_DSP_AOM_FILTER_H_ +#ifndef AOM_AOM_DSP_AOM_FILTER_H_ +#define AOM_AOM_DSP_AOM_FILTER_H_ #include "aom/aom_integer.h" @@ -53,4 +53,4 @@ static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { } // extern "C" #endif -#endif // AOM_DSP_AOM_FILTER_H_ +#endif // AOM_AOM_DSP_AOM_FILTER_H_ diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h index 392b36627..ab950ca55 100644 --- a/third_party/aom/aom_dsp/aom_simd.h +++ b/third_party/aom/aom_dsp/aom_simd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_AOM_SIMD_H_ -#define AOM_DSP_AOM_AOM_SIMD_H_ +#ifndef AOM_AOM_DSP_AOM_SIMD_H_ +#define AOM_AOM_DSP_AOM_SIMD_H_ #include <stdint.h> @@ -35,4 +35,4 @@ #include "simd/v256_intrinsics.h" #endif -#endif // AOM_DSP_AOM_AOM_SIMD_H_ +#endif // AOM_AOM_DSP_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h index 02a8b3a17..eb333f6f6 100644 --- a/third_party/aom/aom_dsp/aom_simd_inline.h +++ b/third_party/aom/aom_dsp/aom_simd_inline.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_SIMD_INLINE_H_ -#define AOM_DSP_AOM_SIMD_INLINE_H_ +#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ +#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ #include "aom/aom_integer.h" @@ -18,4 +18,4 @@ #define SIMD_INLINE static AOM_FORCE_INLINE #endif -#endif // AOM_DSP_AOM_SIMD_INLINE_H_ +#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c index 82c0b0e28..e7f08a5fd 100644 --- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c +++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c @@ -86,7 +86,8 @@ static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride, const int16x8_t vec_round_bits) { int16x8_t src0_0, src0_1; int16x8_t src1_0, src1_1; - uint64x2_t tu0, tu1, tu2, tu3; + uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0), + tu3 = vdupq_n_u64(0); int16x8_t mask0_1, mask2_3; int16x8_t res0, res1; @@ -154,7 +155,8 @@ void aom_lowbd_blend_a64_d16_mask_neon( assert(IS_POWER_OF_TWO(w)); uint8x8_t s0, s1, s2, s3; - uint32x2_t tu0, tu1, tu2, tu3; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; int16x8_t mask0, mask1, mask2, mask3; int16x8_t mask4, mask5, mask6, mask7; diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c index 44d821821..cf618eee7 100644 --- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -17,13 +17,9 @@ #include "aom_ports/mem.h" #include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" #include "aom_dsp/variance.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -83,9 +79,9 @@ unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } @@ -98,9 +94,9 @@ unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } @@ -113,9 +109,9 @@ unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } @@ -128,8 +124,8 @@ unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index d05c3efdc..01088010a 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -36,7 +36,7 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); @@ -45,7 +45,7 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r, static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, uint16_t n) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_rb_read_literal(rb, l - 1); return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); @@ -121,13 +121,3 @@ int16_t aom_rb_read_signed_primitive_refsubexpfin( const uint16_t scaled_n = (n << 1) - 1; return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; } - -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { - int leading_zeros = 0; - while (!aom_rb_read_bit(rb)) ++leading_zeros; - // Maximum 32 bits. - if (leading_zeros >= 32) return UINT32_MAX; - const uint32_t base = (1u << leading_zeros) - 1; - const uint32_t value = aom_rb_read_literal(rb, leading_zeros); - return base + value; -} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 5253c6154..364a67469 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BINARY_CODES_READER_H_ -#define AOM_DSP_BINARY_CODES_READER_H_ +#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_ +#define AOM_AOM_DSP_BINARY_CODES_READER_H_ #ifdef __cplusplus extern "C" { @@ -40,10 +40,8 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); - #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BINARY_CODES_READER_H_ +#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_ diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c index 8f74f0942..ee7a9f567 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -59,7 +59,7 @@ int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { // Encodes a value v in [0, n-1] quasi-uniformly void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { if (n <= 1) return; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_write_literal(w, v, l - 1); @@ -72,7 +72,7 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t v) { if (n <= 1) return; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_wb_write_literal(wb, v, l - 1); @@ -84,7 +84,7 @@ static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, int aom_count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; return v < m ? l - 1 : l; } @@ -208,15 +208,3 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, const uint16_t scaled_n = (n << 1) - 1; return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); } - -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { - int64_t shift_val = ++v; - int leading_zeroes = 1; - - assert(shift_val > 0); - - while (shift_val >>= 1) leading_zeroes += 2; - - aom_wb_write_literal(wb, 0, leading_zeroes >> 1); - aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); -} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h index 784c721a6..c360e0e29 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BINARY_CODES_WRITER_H_ -#define AOM_DSP_BINARY_CODES_WRITER_H_ +#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_ +#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_ #ifdef __cplusplus extern "C" { @@ -61,9 +61,8 @@ int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v); int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, int16_t v); -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BINARY_CODES_WRITER_H_ +#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 328935be9..7c0efcc78 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITREADER_H_ -#define AOM_DSP_BITREADER_H_ +#ifndef AOM_AOM_DSP_BITREADER_H_ +#define AOM_AOM_DSP_BITREADER_H_ #include <assert.h> #include <limits.h> @@ -69,6 +69,12 @@ static INLINE int aom_reader_has_error(aom_reader *r) { return aom_daala_reader_has_error(r); } +// Returns true if the bit reader has tried to decode more data from the buffer +// than was actually provided. +static INLINE int aom_reader_has_overflowed(const aom_reader *r) { + return aom_daala_reader_has_overflowed(r); +} + // Returns the position in the bit reader in bits. static INLINE uint32_t aom_reader_tell(const aom_reader *r) { return aom_daala_reader_tell(r); @@ -151,4 +157,4 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AOM_DSP_BITREADER_H_ +#endif // AOM_AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index 02b5ef924..b53211784 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -55,3 +55,13 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; return ((int)value) >> nbits; } + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { + int leading_zeros = 0; + while (!aom_rb_read_bit(rb)) ++leading_zeros; + // Maximum 32 bits. + if (leading_zeros >= 32) return UINT32_MAX; + const uint32_t base = (1u << leading_zeros) - 1; + const uint32_t value = aom_rb_read_literal(rb, leading_zeros); + return base + value; +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h index 5c94ab883..725ca1ea2 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITREADER_BUFFER_H_ -#define AOM_DSP_BITREADER_BUFFER_H_ +#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_ +#define AOM_AOM_DSP_BITREADER_BUFFER_H_ #include <limits.h> @@ -41,8 +41,10 @@ uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); + #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BITREADER_BUFFER_H_ +#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index de1b1d048..b5ecc2382 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITWRITER_H_ -#define AOM_DSP_BITWRITER_H_ +#ifndef AOM_AOM_DSP_BITWRITER_H_ +#define AOM_AOM_DSP_BITWRITER_H_ #include <assert.h> @@ -86,4 +86,4 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AOM_DSP_BITWRITER_H_ +#endif // AOM_AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c index a563bf684..596246deb 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -73,3 +73,15 @@ void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits) { aom_wb_write_literal(wb, data, bits + 1); } + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { + int64_t shift_val = ++v; + int leading_zeroes = 1; + + assert(shift_val > 0); + + while (shift_val >>= 1) leading_zeroes += 2; + + aom_wb_write_literal(wb, 0, leading_zeroes >> 1); + aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); +} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h index f7f75a097..d0311284f 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.h +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITWRITER_BUFFER_H_ -#define AOM_DSP_BITWRITER_BUFFER_H_ +#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ #include "aom/aom_integer.h" @@ -42,8 +42,10 @@ void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits); +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); + #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BITWRITER_BUFFER_H_ +#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h index 434bb83a1..fd87dc181 100644 --- a/third_party/aom/aom_dsp/blend.h +++ b/third_party/aom/aom_dsp/blend.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BLEND_H_ -#define AOM_DSP_BLEND_H_ +#ifndef AOM_AOM_DSP_BLEND_H_ +#define AOM_AOM_DSP_BLEND_H_ #include "aom_ports/mem.h" @@ -42,4 +42,4 @@ #define DIFF_FACTOR_LOG2 4 #define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) -#endif // AOM_DSP_BLEND_H_ +#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h index cf7df1dbf..985fcdf9e 100644 --- a/third_party/aom/aom_dsp/buf_ans.h +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BUF_ANS_H_ -#define AOM_DSP_BUF_ANS_H_ +#ifndef AOM_AOM_DSP_BUF_ANS_H_ +#define AOM_AOM_DSP_BUF_ANS_H_ // Buffered forward ANS writer. // Symbols are written to the writer in forward (decode) order and serialized // backwards due to ANS's stack like behavior. @@ -133,4 +133,4 @@ static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) { #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_BUF_ANS_H_ +#endif // AOM_AOM_DSP_BUF_ANS_H_ diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c index 4e224904e..6c2259f23 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -39,3 +39,9 @@ uint32_t aom_daala_reader_tell(const daala_reader *r) { uint32_t aom_daala_reader_tell_frac(const daala_reader *r) { return od_ec_dec_tell_frac(&r->ec); } + +int aom_daala_reader_has_overflowed(const daala_reader *r) { + const uint32_t tell_bits = aom_daala_reader_tell(r); + const uint32_t tell_bytes = (tell_bits + 7) >> 3; + return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); +} diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h index 60c197a49..ba78f916d 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_DAALABOOLREADER_H_ -#define AOM_DSP_DAALABOOLREADER_H_ +#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_ +#define AOM_AOM_DSP_DAALABOOLREADER_H_ #include "aom/aom_integer.h" #include "aom_dsp/entdec.h" @@ -44,6 +44,9 @@ const uint8_t *aom_daala_reader_find_begin(daala_reader *r); const uint8_t *aom_daala_reader_find_end(daala_reader *r); uint32_t aom_daala_reader_tell(const daala_reader *r); uint32_t aom_daala_reader_tell_frac(const daala_reader *r); +// Returns true if the reader has tried to decode more data from the buffer +// than was actually provided. +int aom_daala_reader_has_overflowed(const daala_reader *r); static INLINE int aom_daala_read(daala_reader *r, int prob) { int bit; @@ -154,4 +157,4 @@ static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_DAALABOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h index f9c596c73..3848877ce 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_DAALABOOLWRITER_H_ -#define AOM_DSP_DAALABOOLWRITER_H_ +#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_ +#define AOM_AOM_DSP_DAALABOOLWRITER_H_ #include <stdio.h> @@ -75,4 +75,4 @@ static INLINE void daala_write_symbol(daala_writer *w, int symb, } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_ diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h index 5c15526e9..7ba2b1c39 100644 --- a/third_party/aom/aom_dsp/entcode.h +++ b/third_party/aom/aom_dsp/entcode.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_ENTCODE_H_ -#define AOM_DSP_ENTCODE_H_ +#ifndef AOM_AOM_DSP_ENTCODE_H_ +#define AOM_AOM_DSP_ENTCODE_H_ #include <limits.h> #include <stddef.h> @@ -37,4 +37,4 @@ typedef uint32_t od_ec_window; OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng); -#endif // AOM_DSP_ENTCODE_H_ +#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c index b8e9078c3..d1764c47b 100644 --- a/third_party/aom/aom_dsp/entdec.c +++ b/third_party/aom/aom_dsp/entdec.c @@ -112,6 +112,7 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, int ret) { int d; assert(rng <= 65535U); + // The number of leading zeros in the 16-bit binary representation of rng. d = 16 - OD_ILOG_NZ(rng); dec->cnt -= d; /*This is equivalent to shifting in 1's instead of 0's.*/ diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h index e35c3f99f..283bf1831 100644 --- a/third_party/aom/aom_dsp/entdec.h +++ b/third_party/aom/aom_dsp/entdec.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_entdec_H) -#define _entdec_H (1) +#ifndef AOM_AOM_DSP_ENTDEC_H_ +#define AOM_AOM_DSP_ENTDEC_H_ #include <limits.h> #include "aom_dsp/entcode.h" @@ -80,4 +80,4 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c index 6866de9b9..a61da263c 100644 --- a/third_party/aom/aom_dsp/entenc.c +++ b/third_party/aom/aom_dsp/entenc.c @@ -60,6 +60,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, int s; c = enc->cnt; assert(rng <= 65535U); + // The number of leading zeros in the 16-bit binary representation of rng. d = 16 - OD_ILOG_NZ(rng); s = c + d; /*TODO: Right now we flush every time we have at least one byte available. diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h index 1988f6818..3551d4250 100644 --- a/third_party/aom/aom_dsp/entenc.h +++ b/third_party/aom/aom_dsp/entenc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_entenc_H) -#define _entenc_H (1) +#ifndef AOM_AOM_DSP_ENTENC_H_ +#define AOM_AOM_DSP_ENTENC_H_ #include <stddef.h> #include "aom_dsp/entcode.h" @@ -82,4 +82,4 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h index 2f3cd5fdc..5137331ae 100644 --- a/third_party/aom/aom_dsp/fft_common.h +++ b/third_party/aom/aom_dsp/fft_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_FFT_COMMON_H_ -#define AOM_DSP_FFT_COMMON_H_ +#ifndef AOM_AOM_DSP_FFT_COMMON_H_ +#define AOM_AOM_DSP_FFT_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -1047,4 +1047,4 @@ void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ } -#endif // AOM_DSP_FFT_COMMON_H_ +#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c index ff1ec41a2..b96e1c319 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.c +++ b/third_party/aom/aom_dsp/grain_synthesis.c @@ -396,14 +396,15 @@ static void init_random_generator(int luma_line, uint16_t seed) { random_register ^= ((luma_num * 173 + 105) & 255); } -static void generate_luma_grain_block( +// Return 0 for success, -1 for failure +static int generate_luma_grain_block( const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad) { if (params->num_y_points == 0) { memset(luma_grain_block, 0, sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); - return; + return 0; } int bit_depth = params->bit_depth; @@ -433,9 +434,11 @@ static void generate_luma_grain_block( ((wsum + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } + return 0; } -static void generate_chroma_grain_blocks( +// Return 0 for success, -1 for failure +static int generate_chroma_grain_blocks( const aom_film_grain_t *params, // int** pred_pos_luma, int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, @@ -510,10 +513,11 @@ static void generate_chroma_grain_blocks( wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; } else { - printf( + fprintf( + stderr, "Grain synthesis: prediction between two chroma components is " "not supported!"); - exit(1); + return -1; } } if (params->num_cb_points || params->chroma_scaling_from_luma) @@ -527,6 +531,7 @@ static void generate_chroma_grain_blocks( ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } + return 0; } static void init_scaling_function(const int scaling_points[][2], int num_points, @@ -910,8 +915,8 @@ static void hor_boundary_overlap(int *top_block, int top_stride, } } -void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, - aom_image_t *dst) { +int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, + aom_image_t *dst) { uint8_t *luma, *cb, *cr; int height, width, luma_stride, chroma_stride; int use_high_bit_depth = 0; @@ -953,8 +958,8 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, chroma_subsamp_y = 0; break; default: // unknown input format - printf("Film grain error: input format is not supported!"); - exit(1); + fprintf(stderr, "Film grain error: input format is not supported!"); + return -1; } assert(params->bit_depth == src->bit_depth); @@ -1011,17 +1016,16 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; - av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, - chroma_stride, use_high_bit_depth, chroma_subsamp_y, - chroma_subsamp_x, mc_identity); - return; + return av1_add_film_grain_run( + params, luma, cb, cr, height, width, luma_stride, chroma_stride, + use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } -void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity) { +int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { int **pred_pos_luma; int **pred_pos_chroma; int *luma_grain_block; @@ -1085,18 +1089,20 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, chroma_subsamp_x); - generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, - luma_block_size_y, luma_block_size_x, - luma_grain_stride, left_pad, top_pad, right_pad, - bottom_pad); - - generate_chroma_grain_blocks( - params, - // pred_pos_luma, - pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, - luma_grain_stride, chroma_block_size_y, chroma_block_size_x, - chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, - chroma_subsamp_y, chroma_subsamp_x); + if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad)) + return -1; + + if (generate_chroma_grain_blocks( + params, + // pred_pos_luma, + pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, + luma_grain_stride, chroma_block_size_y, chroma_block_size_x, + chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, + chroma_subsamp_y, chroma_subsamp_x)) + return -1; init_scaling_function(params->scaling_points_y, params->num_y_points, scaling_lut_y); @@ -1399,4 +1405,5 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); + return 0; } diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h index 65feb6068..7aee6f6f4 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.h +++ b/third_party/aom/aom_dsp/grain_synthesis.h @@ -13,8 +13,8 @@ * \brief Describes film grain parameters and film grain synthesis * */ -#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_ -#define AOM_AOM_GRAIN_SYNTHESIS_H_ +#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ +#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ #ifdef __cplusplus extern "C" { @@ -85,6 +85,8 @@ typedef struct { * * Add film grain to an image * + * Returns 0 for success, -1 for failure + * * \param[in] grain_params Grain parameters * \param[in] luma luma plane * \param[in] cb cb plane @@ -94,25 +96,27 @@ typedef struct { * \param[in] luma_stride luma plane stride * \param[in] chroma_stride chroma plane stride */ -void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity); +int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity); /*!\brief Add film grain * * Add film grain to an image * + * Returns 0 for success, -1 for failure + * * \param[in] grain_params Grain parameters * \param[in] src Source image * \param[out] dst Resulting image with grain */ -void av1_add_film_grain(const aom_film_grain_t *grain_params, - const aom_image_t *src, aom_image_t *dst); +int av1_add_film_grain(const aom_film_grain_t *grain_params, + const aom_image_t *src, aom_image_t *dst); #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_AOM_GRAIN_SYNTHESIS_H_ +#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h index 5c20413b2..a8ac50730 100644 --- a/third_party/aom/aom_dsp/grain_table.h +++ b/third_party/aom/aom_dsp/grain_table.h @@ -99,4 +99,4 @@ void aom_film_grain_table_free(aom_film_grain_table_t *t); } #endif -#endif +#endif // AOM_AOM_DSP_GRAIN_TABLE_H_ diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h index e047d98bc..3ec62a86e 100644 --- a/third_party/aom/aom_dsp/intrapred_common.h +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_INTRAPRED_COMMON_H -#define _AOM_DSP_INTRAPRED_COMMON_H +#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ +#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ #include "config/aom_config.h" @@ -44,4 +44,4 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { }; /* clang-format on */ -#endif // _AOM_DSP_INTRAPRED_COMMON_H +#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h index a0627c074..852415c20 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ -#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ #include "aom_dsp/mips/macros_msa.h" #include "aom_dsp/aom_filter.h" @@ -76,4 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; res7_m, out0, out1, out2, out3); \ } -#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h index d51bfa899..c42188d62 100644 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_COMMON_MIPS_DSPR2_H_ -#define AOM_COMMON_MIPS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ #include <assert.h> @@ -48,4 +48,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) { } // extern "C" #endif -#endif // AOM_COMMON_MIPS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c index 2a8f75938..097da73ca 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c index ac87936da..40abfd89e 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h index e7b8d531b..e5d48a884 100644 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ -#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ #include <assert.h> @@ -45,4 +45,4 @@ void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h index 3e38ef3fb..28f0dc35a 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ #include <stdlib.h> @@ -733,4 +733,4 @@ static INLINE void wide_mbfilter_dspr2( } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h index cb599cf2e..62295d69d 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ #include <stdlib.h> @@ -434,4 +434,4 @@ extern "C" { } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h index 6db1dac08..a0f57f386 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ #include <stdlib.h> @@ -354,4 +354,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h index 450594262..54b0bb4bd 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_LOOPFILTER_MSA_H_ -#define AOM_DSP_LOOPFILTER_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ #include "aom_dsp/mips/macros_msa.h" @@ -248,4 +248,4 @@ mask_out = limit_in < (v16u8)mask_out; \ mask_out = __msa_xori_b(mask_out, 0xff); \ } -#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h index eb919d42b..9bfc27147 100644 --- a/third_party/aom/aom_dsp/mips/macros_msa.h +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_MACROS_MSA_H_ -#define AOM_DSP_MIPS_MACROS_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ #include <msa.h> @@ -2055,4 +2055,4 @@ \ tmp1_m; \ }) -#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c index a8ee85b6b..810b6efaa 100644 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -13,13 +13,9 @@ #include "aom_ports/mem.h" #include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" #include "aom_dsp/variance.h" -static const uint8_t bilinear_filters_msa[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - #define CALC_MSE_AVG_B(src, ref, var, sub) \ { \ v16u8 src_l0_m, src_l1_m; \ @@ -1626,8 +1622,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ @@ -1680,8 +1676,8 @@ AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ @@ -1730,8 +1726,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; if (yoffset) { if (xoffset) { @@ -1763,8 +1759,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c index 5975c62e8..2faee8506 100644 --- a/third_party/aom/aom_dsp/noise_model.c +++ b/third_party/aom/aom_dsp/noise_model.c @@ -1135,7 +1135,9 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); return 0; } + uint16_t random_seed = film_grain->random_seed; memset(film_grain, 0, sizeof(*film_grain)); + film_grain->random_seed = random_seed; film_grain->apply_grain = 1; film_grain->update_parameters = 1; @@ -1633,7 +1635,7 @@ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, return 0; } if (!film_grain->random_seed) { - film_grain->random_seed = 1071; + film_grain->random_seed = 7391; } memcpy(raw_data[0], ctx->denoised[0], (strides[0] * sd->y_height) << use_highbd); diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h index b07bf8617..049d5be15 100644 --- a/third_party/aom/aom_dsp/noise_model.h +++ b/third_party/aom/aom_dsp/noise_model.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_NOISE_MODEL_H_ -#define AOM_DSP_NOISE_MODEL_H_ +#ifndef AOM_AOM_DSP_NOISE_MODEL_H_ +#define AOM_AOM_DSP_NOISE_MODEL_H_ #ifdef __cplusplus extern "C" { @@ -320,4 +320,4 @@ void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_NOISE_MODEL_H_ +#endif // AOM_AOM_DSP_NOISE_MODEL_H_ diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h index ea4d9e3de..2284a171a 100644 --- a/third_party/aom/aom_dsp/noise_util.h +++ b/third_party/aom/aom_dsp/noise_util.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_NOISE_UTIL_H_ -#define AOM_DSP_NOISE_UTIL_H_ +#ifndef AOM_AOM_DSP_NOISE_UTIL_H_ +#define AOM_AOM_DSP_NOISE_UTIL_H_ #ifdef __cplusplus extern "C" { @@ -65,4 +65,4 @@ int aom_noise_data_validate(const double *data, int w, int h); } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_NOISE_UTIL_H_ +#endif // AOM_AOM_DSP_NOISE_UTIL_H_ diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h index 11a8c5ad7..f3d87f264 100644 --- a/third_party/aom/aom_dsp/postproc.h +++ b/third_party/aom/aom_dsp/postproc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_POSTPROC_H_ -#define AOM_DSP_POSTPROC_H_ +#ifndef AOM_AOM_DSP_POSTPROC_H_ +#define AOM_AOM_DSP_POSTPROC_H_ #ifdef __cplusplus extern "C" { @@ -23,4 +23,4 @@ int aom_setup_noise(double sigma, int size, char *noise); } #endif -#endif // AOM_DSP_POSTPROC_H_ +#endif // AOM_AOM_DSP_POSTPROC_H_ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index 85dd4249d..d003a986e 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_PROB_H_ -#define AOM_DSP_PROB_H_ +#ifndef AOM_AOM_DSP_PROB_H_ +#define AOM_AOM_DSP_PROB_H_ #include <assert.h> #include <stdio.h> @@ -668,4 +668,4 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { } // extern "C" #endif -#endif // AOM_DSP_PROB_H_ +#endif // AOM_AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c index 37d3bb585..50f376a4a 100644 --- a/third_party/aom/aom_dsp/psnr.c +++ b/third_party/aom/aom_dsp/psnr.c @@ -295,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, default: assert(plane >= 0 && plane <= 2); return 0; } } - (void)highbd; switch (plane) { case 0: return aom_get_y_sse(a, b); case 1: return aom_get_u_sse(a, b); diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h index 8300b0a88..58e4e71ee 100644 --- a/third_party/aom/aom_dsp/psnr.h +++ b/third_party/aom/aom_dsp/psnr.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_PSNR_H_ -#define AOM_DSP_PSNR_H_ +#ifndef AOM_AOM_DSP_PSNR_H_ +#define AOM_AOM_DSP_PSNR_H_ #include "aom_scale/yv12config.h" @@ -76,4 +76,4 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_PSNR_H_ +#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index e1601cc7d..62dbd86a9 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -13,8 +13,8 @@ #include "aom_mem/aom_mem.h" void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -29,56 +29,54 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && - coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp32; - - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { - int64_t tmp = - clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), - INT16_MIN, INT16_MAX); - tmp *= wt; - tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); // quantization - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int dequant = - (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - - if (tmp32) eob = i; - } + if (tmp32) eob = i; } } *eob_ptr = eob + 1; } void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, @@ -95,42 +93,40 @@ void highbd_quantize_b_helper_c( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || - coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - if (abs_qcoeff) eob = idx_arr[i]; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -138,74 +134,73 @@ void highbd_quantize_b_helper_c( /* These functions should only be called when quantisation matrices are not used. */ void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); } void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_highbd_quantize_b_64x64_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h index 56d50b929..c55ab234e 100644 --- a/third_party/aom/aom_dsp/quantize.h +++ b/third_party/aom/aom_dsp/quantize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_QUANTIZE_H_ -#define AOM_DSP_QUANTIZE_H_ +#ifndef AOM_AOM_DSP_QUANTIZE_H_ +#define AOM_AOM_DSP_QUANTIZE_H_ #include "config/aom_config.h" @@ -21,8 +21,8 @@ extern "C" { #endif void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -30,24 +30,23 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const qm_val_t *iqm_ptr, const int log_scale); void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -57,4 +56,4 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } // extern "C" #endif -#endif // AOM_DSP_QUANTIZE_H_ +#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index ede4c583b..1e24df4a5 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -200,15 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \ + ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } \ unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, \ - ref_stride, jcp_param); \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \ + m, n, ref, ref_stride, jcp_param); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h index 51a38a7e1..01dbb8fd2 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ #include <stdio.h> #include <stdlib.h> @@ -341,4 +341,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return c_v128_ssd_s16_sum(s); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h index d4fec4237..3c669d579 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ #include <arm_neon.h> @@ -375,7 +375,8 @@ SIMD_INLINE uint32_t v128_movemask_8(v128 a) { uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( vandq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); - return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); + return v64_low_u32( + v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); #endif } @@ -954,4 +955,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h index e508f6ad7..bbe9a9d28 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_C_H -#define _V128_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ #include <stdio.h> #include <stdlib.h> @@ -885,4 +885,4 @@ SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } -#endif /* _V128_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h index f9043fe99..6c7241ff4 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ #include <stdint.h> #include "aom_dsp/simd/v64_intrinsics_x86.h" @@ -653,4 +653,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h index 4b70cc57b..cb99d35b7 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ #include <stdio.h> #include <stdlib.h> @@ -373,4 +373,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { return c_v256_ssd_s16_sum(s); } -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h index d96638488..bd86ea172 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ #include "aom_dsp/simd/v256_intrinsics_v128.h" -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h index 5b412df71..a1c08e95a 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_C_H -#define _V256_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ #include <stdio.h> #include <stdlib.h> @@ -950,4 +950,4 @@ SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } -#endif /* _V256_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h index 60b2a1791..d5b7905ef 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_V128_H -#define _V256_INTRINSICS_V128_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ #if HAVE_NEON #include "aom_dsp/simd/v128_intrinsics_arm.h" @@ -870,4 +870,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); } -#endif /* _V256_INTRINSICS_V128_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h index 05f205169..44594bc41 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ #if !defined(__AVX2__) @@ -747,4 +747,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { #endif -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h index 6ce53c6a9..afc55428d 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ #include <stdio.h> #include <stdlib.h> @@ -229,4 +229,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { return c_v64_shr_n_s32(a, c); } -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h index 267441b02..8f39ad6e8 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ #include <arm_neon.h> @@ -677,4 +677,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { #endif -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h index 8158899cb..028d68c4f 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_C_H -#define _V64_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ /* Note: This implements the intrinsics in plain, unoptimised C. Intended for reference, porting or debugging. */ @@ -965,4 +965,4 @@ SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { return c_v64_shr_s32(a, c); } -#endif /* _V64_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h index 130052ee1..5f9a57b37 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ #include <emmintrin.h> #if defined(__SSSE3__) @@ -488,4 +488,4 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { #define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) #define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c new file mode 100644 index 000000000..249394807 --- /dev/null +++ b/third_party/aom/aom_dsp/sse.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* Sum the difference between every corresponding element of the buffers. */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c index 6ce3d7acb..681770ba9 100644 --- a/third_party/aom/aom_dsp/ssim.c +++ b/third_party/aom/aom_dsp/ssim.c @@ -275,7 +275,7 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0 }; + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h index c8a389dfe..55038f4c2 100644 --- a/third_party/aom/aom_dsp/ssim.h +++ b/third_party/aom/aom_dsp/ssim.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_SSIM_H_ -#define AOM_DSP_SSIM_H_ +#ifndef AOM_AOM_DSP_SSIM_H_ +#define AOM_AOM_DSP_SSIM_H_ #define MAX_SSIM_DB 100.0; @@ -84,4 +84,4 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, } // extern "C" #endif -#endif // AOM_DSP_SSIM_H_ +#endif // AOM_AOM_DSP_SSIM_H_ diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index 7deb0aea3..f98242840 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_TXFM_COMMON_H_ -#define AOM_DSP_TXFM_COMMON_H_ +#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ +#define AOM_AOM_DSP_TXFM_COMMON_H_ #include "aom_dsp/aom_dsp_common.h" #include "av1/common/enums.h" @@ -88,4 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { return rv; } -#endif // AOM_DSP_TXFM_COMMON_H_ +#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index 817ebe15d..23b715309 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -55,24 +55,6 @@ uint32_t aom_get_mb_ss_c(const int16_t *a) { return sum; } -uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse); -} - -uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse); -} - -uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse); -} - static void variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, uint32_t *sse, int *sum) { int i, j; @@ -302,7 +284,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -370,7 +352,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -387,7 +369,9 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { for (int i = 0; i < height; i++) { @@ -398,13 +382,13 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, - width, height); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, - width, height); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); } else { DECLARE_ALIGNED(16, uint8_t, temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); @@ -415,12 +399,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); } } @@ -429,11 +413,11 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { int i, j; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); @@ -466,13 +450,14 @@ void aom_jnt_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { @@ -706,125 +691,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass( dst, dst_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -867,12 +852,13 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; @@ -887,9 +873,10 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, - uint16_t *comp_pred, int width, int height, + uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, int bd) { + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -902,8 +889,6 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, if (is_scaled) { // Note: This is mostly a copy from the >=8X8 case in // build_inter_predictors() function, with some small tweaks. - uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); - // Some assumptions. const int plane = 0; @@ -958,7 +943,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -975,13 +960,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { - const uint16_t *ref; - int i; - ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { memcpy(comp_pred, ref, width * sizeof(*comp_pred)); comp_pred += width; ref += ref_stride; @@ -989,13 +975,13 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, kernel, 16, NULL, -1, width, height, bd); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); @@ -1012,22 +998,23 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, intermediate_height, bd); aom_highbd_convolve8_vert( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); } } void aom_highbd_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { + int ref_stride, int bd, int subpel_search) { int i, j; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); @@ -1037,7 +1024,7 @@ void aom_highbd_comp_avg_upsampled_pred_c( } } -void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param) { @@ -1046,6 +1033,7 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, const int bck_offset = jcp_param->bck_offset; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -1061,17 +1049,18 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_jnt_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { @@ -1104,21 +1093,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } } -void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { +void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { if (subpel_x_q3 | subpel_y_q3) { - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); ref = comp_pred; ref_stride = width; } - aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, - mask_stride, invert_mask); + aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); } #define MASK_SUBPIX_VAR(W, H) \ @@ -1164,13 +1155,14 @@ MASK_SUBPIX_VAR(32, 8) MASK_SUBPIX_VAR(16, 64) MASK_SUBPIX_VAR(64, 16) -void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { if (!invert_mask) @@ -1187,16 +1179,15 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd) { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); - aom_highbd_comp_mask_pred(comp_pred, pred8, width, height, - CONVERT_TO_BYTEPTR(comp_pred), width, mask, - mask_stride, invert_mask); + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); } #define HIGHBD_MASK_SUBPIX_VAR(W, H) \ @@ -1214,7 +1205,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ @@ -1236,7 +1227,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ @@ -1258,7 +1249,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index b954470de..362da29d3 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_VARIANCE_H_ -#define AOM_DSP_VARIANCE_H_ +#ifndef AOM_AOM_DSP_VARIANCE_H_ +#define AOM_AOM_DSP_VARIANCE_H_ #include "config/aom_config.h" @@ -70,18 +70,12 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); -void aom_comp_mask_upsampled_pred( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); - void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd); + int bd, int subpel_search); typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, @@ -133,4 +127,4 @@ uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, } // extern "C" #endif -#endif // AOM_DSP_VARIANCE_H_ +#endif // AOM_AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index 401fbdc48..5f5bf5f14 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; +#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 +#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 +#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 +#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 +#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 +#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 + filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; filter8_1dfunction aom_filter_block1d8_v2_sse2; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index f3fe50372..94b5da171 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr, _mm256_extractf128_si256(*a, 1)); } +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + static void aom_filter_block1d4_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2( } } +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d8_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2( } } +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d8_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2( } } +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2( } } +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d16_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 6bcb4a512..325a21b76 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; +#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 +#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 +#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 +#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 +#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 +#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 + filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..67fb4d32b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> // SSE4.1 +#include <immintrin.h> // AVX2 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w16_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, + int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s1_0 = yy_loadu_256(src1); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + __m256i res = _mm256_packus_epi16(res0, res0); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); +} + +static INLINE void blend_a64_d16_mask_w32_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, + const __m256i *v_maxval, int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s0_1 = yy_loadu_256(src0 + 16); + const __m256i s1_0 = yy_loadu_256(src1); + const __m256i s1_1 = yy_loadu_256(src1 + 16); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), + _mm256_unpacklo_epi16(*m1, max_minus_m1)); + __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), + _mm256_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); + __m256i res = _mm256_packus_epi16(res0, res1); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm256_storeu_si256((__m256i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m = xx_loadu_128(mask); + const __m256i m0 = _mm256_cvtepu8_epi16(m); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + if (subx & suby) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subx) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (suby) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c index 49c20b467..9d6b4c2f7 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -20,6 +20,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" #include "config/aom_dsp_rtcd.h" @@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; @@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_m0l_b = xx_loadl_64(mask + c); - const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); - const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); - const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); - - const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); - const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ra_b = xx_loadu_128(mask + c); const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { @@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, } } -static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0, - const CONV_BUF_TYPE *src1, - const __m128i *m, - const __m128i *v_round_offset, - const __m128i *v_maxval, int round_bits) { - const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); - const __m128i s0 = xx_loadl_64(src0); - const __m128i s1 = xx_loadl_64(src1); - const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); - const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); - const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); - const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS); - const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset); - const __m128i res_d = xx_roundn_epi32(res_c, round_bits); - const __m128i res_e = _mm_packs_epi32(res_d, res_d); - const __m128i res = _mm_packus_epi16(res_e, res_e); - - xx_storel_32(dst, res); +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } } void aom_lowbd_blend_a64_d16_mask_sse4_1( @@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); @@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i v_ro_a = xx_loadl_32(&round_offset); - const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0); - const __m128i one_w = _mm_set1_epi16(1); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); if (subw == 0 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]); - const __m128i m = _mm_cvtepu8_epi16(m0); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } + } else if (subw == 1 && subh == 1) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = - xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]); - const __m128i m_i1 = - xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b); - const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd); - const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); - const __m128i m = _mm_srli_epi16(m_acbd_2, 2); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else if (subw == 1 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]); - const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]); - const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1); - const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } } diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c index 59506bdfe..064910232 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h index 4880438bc..8d9b32510 100644 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -9,42 +9,44 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_BLEND_SSE4_H_ -#define AOM_DSP_X86_BLEND_SSE4_H_ +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; ////////////////////////////////////////////////////////////////////////////// // Common kernels ////////////////////////////////////////////////////////////////////////////// static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); @@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); @@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -#endif // AOM_DSP_X86_BLEND_SSE4_H_ +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h index 3f46420dd..96fe4ebb6 100644 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_COMMON_AVX2_H -#define AOM_DSP_X86_COMMON_AVX2_H +#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_COMMON_AVX2_H_ #include <immintrin.h> @@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); } -#endif +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h index 36fb1963a..3e19682cd 100644 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_H_ -#define AOM_DSP_X86_CONVOLVE_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_H_ #include <assert.h> @@ -17,7 +17,6 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "aom_dsp/aom_convolve.h" typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, (void)y_step_q4; \ assert((-128 <= filter[3]) && (filter[3] <= 127)); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ @@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#endif // AOM_DSP_X86_CONVOLVE_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 72fabd236..30253f65c 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ -#define AOM_DSP_X86_CONVOLVE_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { @@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h index e80c5872f..707bd2d78 100644 --- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, _mm_store_si128((__m128i *)dst, d); } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 399df5d6d..445d04b10 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_ -#define AOM_DSP_X86_CONVOLVE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h index d48c25667..6b8388d84 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, return res; } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 12ccf7f26..260d8dd58 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#define AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ #ifdef __cplusplus extern "C" { @@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } // extern "C" #endif -#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm deleted file mode 100644 index 99f17ebdf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm +++ /dev/null @@ -1,351 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -SECTION .text - -;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -aom_half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -aom_half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz aom_half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref, -; int ref_stride -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - -aom_half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -aom_bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c deleted file mode 100644 index 2a018c1cf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, - int ref_stride, - const unsigned char *src, - int src_stride, unsigned int height, - int *sum, unsigned int *sumsquared); -void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); -void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); - -uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, - &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 83e0098ba..097e0778f 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, const unsigned char *lt, const unsigned char *thr, int bd) { int i; + const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i t80; get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); @@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3], flat_pq[3]; - __m128i flat2_p[6], flat2_q[6]; - __m128i flat2_pq[6]; - { - __m128i work0; + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); @@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); - flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); - flat2_q[0] = - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); - __m128i sum_p6, sum_p3; sum_p6 = _mm_add_epi16(pq[6], pq[6]); sum_p3 = _mm_add_epi16(pq[3], pq[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - flat2_p[1] = _mm_add_epi16(sum_p, work0); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); work0 = _mm_add_epi16(sum_p3, pq[1]); @@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); sum_p3 = _mm_add_epi16(sum_p3, pq[3]); work0 = _mm_add_epi16(sum_p3, pq[2]); @@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - } - - // highbd_filter8 - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - - for (i = 0; i < 3; i++) { - pq[i] = _mm_andnot_si128(flat, pq[i]); - flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat_pq[i]); - } - - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - pq[i] = _mm_andnot_si128(flat2, pq[i]); - flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); - // get values for when (flat2 && flat && mask) - pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); } } @@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, const uint8_t *thr1, int bd) { __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, &t80); __m128i mask; @@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i ps[2], qs[2]; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - { + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - flat2_p[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), - _mm_add_epi16(p[1], q[0]))), - 4); - flat2_q[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), - _mm_add_epi16(p[0], q[1]))), - 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = @@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); - flat2_p[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), - 4); - flat2_q[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), - 4); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p3 = _mm_add_epi16(sum_p3, p[3]); - sum_q3 = _mm_add_epi16(sum_q3, q[3]); - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - flat2_p[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), - 4); - flat2_q[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), - 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - flat2_p[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), - 4); - flat2_q[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - flat2_p[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), - 4); - flat2_q[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - flat2_p[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), - 4); - flat2_q[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), - 4); - } - // highbd_filter8 - p[2] = _mm_andnot_si128(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm_and_si128(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm_andnot_si128(flat, q[2]); - flat_q[2] = _mm_and_si128(flat, flat_q[2]); - q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; } } @@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); @@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), - _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 - workp_a = - _mm_add_epi16(workp_a, - workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 - - flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3); + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), - *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 - workp_b = _mm_add_epi16(*q1, *q2); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), - *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); - workp_shft1 = _mm_add_epi16( - workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); - } - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( @@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + // flat_mask flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); flat = _mm_max_epi16(flat, work); @@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_shft0, workp_shft1; // op1 @@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( workp_shft1 = _mm_add_epi16( workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 oq1 = _mm_srli_epi16(workp_shft1, 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } } void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, @@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( __m128i mask, hev, flat; __m128i pq[4]; __m128i p1p0, q1q0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); @@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask4 flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); @@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); @@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( @@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); flat = _mm_max_epi16(work1, flat); work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); flat = _mm_max_epi16(work0, flat); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b; // Added before shift for rounding part of ROUND_POWER_OF_TWO @@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); - - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c index dea113a29..b9689202a 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; const unsigned int step = 8; - if (LIKELY(!skip_block)) { - __m256i qp[5], coeff; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 5570ca5b7..58e5f98e5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,7 +16,7 @@ #include "aom_ports/mem.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void aom_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c new file mode 100644 index 000000000..9b1b4c9de --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "config/aom_dsp_rtcd.h" + +typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + for (int i = 0; i < 8; i += 2) { + const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); + const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); + const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); + __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); + __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); + v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); + v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride * 2; + ref += ref_stride * 2; + } + __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); + __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); + __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + for (int i = 0; i < 16; ++i) { + const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); + const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride; + ref += ref_stride; + } + __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 131c16aa9..47b052abc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -179,6 +179,9 @@ HIGH_GET_VAR(8); return (var >= 0) ? (uint32_t)var : 0; \ } +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); VAR_FN(64, 64, 16, 12); VAR_FN(64, 32, 16, 11); VAR_FN(32, 64, 16, 11); @@ -590,10 +593,10 @@ FNS(sse2); void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, - uint16_t *comp_pred, int width, int height, + uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, - int bd) { + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, if (is_scaled) { // Note: This is mostly a copy from the >=8X8 case in // build_inter_predictors() function, with some small tweaks. - uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); - // Some assumptions. const int plane = 0; @@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { int i; assert(!(width & 7)); @@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, kernel, 16, NULL, -1, width, height, bd); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); @@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, intermediate_height, bd); aom_highbd_convolve8_vert( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); } } void aom_highbd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - int n; - int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); - n = width * height >> 3; - for (i = 0; i < n; i++) { - __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0)); - comp_pred += 8; + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; pred += 8; } } @@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, +void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, @@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, _mm_set_epi16(round, round, round, round, round, round, round, round); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { // Read 8 pixels one row at a time @@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); assert(!(width * height & 7)); n = width * height >> 3; @@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( const __m128i r = _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); - comp_pred += 8; + comp_pred16 += 8; pred += 8; } } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c index 6c247a91b..df5449a9d 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index eaf1f347b..f9a41a210 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index 18862dd3e..000000000 --- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,916 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> /* AVX2 */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" - -void aom_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, - p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, - flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); - q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - const __m256i filter = - _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = - _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = - _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16( - eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16( - four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), - 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), - 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), - 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), - 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), - 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), - 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - _mm256_zeroupper(); -} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index f1eac233b..9d88b5e49 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2( *d7 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontaly Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + static INLINE void transpose8x16_16x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, @@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2( *d14d15 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontaly. Used +// for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); const __m128i t80 = _mm_set1_epi8(0x80); @@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { __m128i q1p1, q0p0, p1p0, q1q0; __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; __m128i mask, hev; const __m128i zero = _mm_setzero_si128(); @@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( mask = _mm_cmpeq_epi8(mask, zero); mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); - filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); } void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); xx_storel_32(s - 1 * p, ps1ps0); - xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); xx_storel_32(s + 0 * p, qs1qs0); - xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, __m128i p1, p0, q0, q1; const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 - p1 = _mm_srli_si128(p1p0, 8); - q1 = _mm_srli_si128(q1q0, 8); + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); @@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { xx_storel_32(s - (num + 1) * p, x); - xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); } -static AOM_FORCE_INLINE void lpf_internal_14_sse2( +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); } // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); // loopfilter done __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - { - __m128i work; - flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - work = abs_diff(*q6p6, *q0p0); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; @@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +} - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; - *q2p2 = _mm_andnot_si128(flat, *q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - *q5p5 = _mm_andnot_si128(flat2, *q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - *q4p4 = _mm_andnot_si128(flat2, *q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - *q3p3 = _mm_andnot_si128(flat2, *q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } - *q2p2 = _mm_andnot_si128(flat2, *q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done - *q1p1 = _mm_andnot_si128(flat2, *q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - *q0p0 = _mm_andnot_si128(flat2, *q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } } void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, @@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), _mm_cvtsi32_si128(*(int *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), _mm_cvtsi32_si128(*(int *)(s - 0 * p))); - q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), _mm_cvtsi32_si128(*(int *)(s + 5 * p))); - q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), _mm_cvtsi32_si128(*(int *)(s + 6 * p))); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, @@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, store_buffer_horz_8(q5p5, p, 5, s); } -static AOM_FORCE_INLINE void lpf_internal_6_sse2( +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { // filter_mask and hev_mask __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // flat_mask flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); @@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( } // 5 tap filter - { + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); @@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } +} - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0); +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - *q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - *p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } } void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, @@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); } void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, @@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); @@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, static AOM_FORCE_INLINE void lpf_internal_8_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit, - __m128i *thresh) { + __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, flat_p1p0, flat_q0q1; __m128i q2p2, q1p1, q0p0; __m128i q1q0, p1p0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; q3p3 = _mm_unpacklo_epi64(*p3, *q3); q2p2 = _mm_unpacklo_epi64(*p2, *q2); @@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); - // flat_mask4 + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); @@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( flat = _mm_unpacklo_epi64(flat, flat); } - // filter8 - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); @@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - op2 = _mm_packus_epi16(workp_shft0, workp_shft0); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op1 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); @@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - oq2 = _mm_packus_epi16(workp_shft1, workp_shft1); - } - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - q2_16 = _mm_and_si128(flat, oq2); - *q2_out = _mm_or_si128(work_a, q2_16); - - work_a = _mm_andnot_si128(flat, *p2); - p2_16 = _mm_and_si128(flat, op2); - *p2_out = _mm_or_si128(work_a, p2_16); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, @@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_limit, const unsigned char *_thresh) { __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); @@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + &blimit, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); - xx_storel_32(s - 3 * p, p2_out); - xx_storel_32(s + 2 * p, q2_out); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); } void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, @@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), _mm_loadl_epi64((__m128i *)(s + 6 * p))); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); @@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, _mm_load_si128((__m128i *)_thresh1)); __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); @@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, @@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); @@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, &q1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); p1 = _mm_srli_si128(ps1ps0, 8); q1 = _mm_srli_si128(qs1qs0, 8); @@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p, lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); @@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); @@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *_thresh) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i p2, p0, q0, q2; + __m128i p0, q0; __m128i x2, x1, x0, x3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); // Loop filtering - lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2, - &q2, &blimit, &limit, &thresh); + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); - transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1, + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); @@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d1, d3, d5, d7; __m128i q1q0, p1p0; - __m128i p2, p1, q1, q2; + __m128i p1, q1; __m128i d0d1, d2d3, d4d5, d6d7; x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); @@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, - &p1p0, &p2, &q2, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); p1 = _mm_srli_si128(p1p0, 8); q1 = _mm_srli_si128(q1q0, 8); - transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3, - &d4d5, &d6d7); + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); @@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i x6, x5, x4, x3, x2, x1, x0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i p0_out, p1_out, p2_out, p3_out; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); - x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6, - &p7); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); - x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, - &q7); - - q6p6 = _mm_unpacklo_epi64(p1, q6); - q5p5 = _mm_unpacklo_epi64(p2, q5); - q4p4 = _mm_unpacklo_epi64(p3, q4); - q3p3 = _mm_unpacklo_epi64(p4, q3); - q2p2 = _mm_unpacklo_epi64(p5, q2); - q1p1 = _mm_unpacklo_epi64(p6, q1); - q0p0 = _mm_unpacklo_epi64(p7, q0); + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); - transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, - &p0_out, &p1_out, &p2_out, &p3_out); - - x0 = _mm_srli_si128(q0p0, 8); - x1 = _mm_srli_si128(q1p1, 8); - x2 = _mm_srli_si128(q2p2, 8); - x3 = _mm_srli_si128(q3p3, 8); - x4 = _mm_srli_si128(q4p4, 8); - x5 = _mm_srli_si128(q5p5, 8); - x6 = _mm_srli_si128(q6p6, 8); - - transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2, - &q3); - - _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out); - _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out); - _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out); - _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out); - - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - _mm_storel_epi64((__m128i *)(s + 3 * p), q3); + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); } void aom_lpf_vertical_14_dual_sse2( @@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2( q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); q7 = _mm_srli_si128(d14d15, 8); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); x0 = _mm_srli_si128(q0p0, 8); x1 = _mm_srli_si128(q1p1, 8); diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h index c6b6469b4..8970fe7dd 100644 --- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H -#define _AOM_DSP_X86_LPF_COMMON_X86_H +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } -#endif // _AOM_DSP_X86_LPF_COMMON_X86_H +#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c index 6538e4d5e..584b5e7e3 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -9,7 +9,6 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <stdio.h> #include <tmmintrin.h> #include "config/aom_config.h" diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h index 19b429d91..cffbd9672 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, @@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, const uint8_t *m_ptr, int m_stride, int height); -#endif +#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h index dc41a8342..4faa098ac 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ #include <stdlib.h> #include <string.h> @@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, } while (i < height); } -#endif +#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h index 8b69606dd..6c821673e 100644 --- a/third_party/aom/aom_dsp/x86/mem_sse2.h +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_MEM_SSE2_H_ -#define AOM_DSP_X86_MEM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_AOM_DSP_X86_MEM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, return dst; } -#endif // AOM_DSP_X86_MEM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h new file mode 100644 index 000000000..5181e444c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ + +#include <smmintrin.h> + +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n)); + const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); + const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h index a3535f985..48486c6c4 100644 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ -#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ #include <immintrin.h> @@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); } -#endif // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m128i v_d; + const uint8_t *pre_temp; + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); + const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else if (W == 8) { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } else { \ + obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 2e2f6e09f..72eda0e57 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -19,7 +19,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// @@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int h) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_b = xx_loadl_32(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm index e6b40262d..216a0bd8f 100644 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -16,16 +16,12 @@ SECTION .text %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan vzeroupper - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - %ifnidn %1, b_32x32 ; Special case for ncoeff == 16, as it is frequent and we can save on @@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ .single_nonzero: ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift + movifnidn r3, roundmp + movifnidn r4, quantmp + mov r6, dequantmp + mov r5, shiftmp + mova m1, [r3] ; m1 = round + mova m2, [r4] ; m2 = quant + mova m3, [r6] ; m3 = dequant + mova m4, [r5] ; m4 = shift mov r3, iscanmp @@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif ; %ifnidn %1, b_32x32 -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ +DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ qcoeff, dqcoeff, dequant, eob, scan, iscan ; Actual quantization loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant + mova m3, [dequantq] ; m3 = dequant pcmpeqw m4, m4 ; All lanes -1 %ifidn %1, b_32x32 psubw m0, m4 @@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov r2, shiftmp mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift + mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 @@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] @@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov [r2], ax vzeroupper RET - - ; Skip-block, i.e. just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 - -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET %endmacro INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 46b9c7d29..d3de6e24d 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -9,242 +9,139 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); - } else { - return _mm_load_si128((const __m128i *)coeff_ptr); - } + assert(sizeof(tran_low_t) == 4); + + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); - } else { - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); - } + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm index e2c1ebb71..39d4ca674 100644 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -18,21 +18,18 @@ pw_1: times 8 dw 1 SECTION .text -; TODO(yunqingwang)fix quantize_b code for skip=1 case. %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant @@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [GLOBAL(pw_1)] + mova m3, [dequantq] ; m3 = dequant mov r2, shiftmp - mov r3, qcoeffmp + psubw m0, [GLOBAL(pw_1)] mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 psllw m4, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] @@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pextrw r6, m8, 0 mov [r2], r6 RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..4eed7dd29 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_x86.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom/aom_integer.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..305dde5c0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_avx2.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <smmintrin.h> +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = yy_loadu_256(a); + const __m256i v_b0 = yy_loadu_256(b); + const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); + const __m256i v_a01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); + const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); + const __m256i v_b01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} + +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..8b5af8469 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c new file mode 100644 index 000000000..0af44e3a4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <smmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/sum_squares_sse2.h" +#include "config/aom_dsp_rtcd.h" + +static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, + int width, int height) { + uint64_t result; + __m256i v_acc_q = _mm256_setzero_si256(); + const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff); + for (int col = 0; col < height; col += 4) { + __m256i v_acc_d = _mm256_setzero_si256(); + for (int row = 0; row < width; row += 16) { + const int16_t *tempsrc = src + row; + const __m256i v_val_0_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); + const __m256i v_val_1_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); + const __m256i v_val_2_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); + const __m256i v_val_3_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); + + const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); + const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); + const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); + const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); + + const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); + const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); + const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d); + } + v_acc_q = + _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); + src += 4 * stride; + } + __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); + __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); + __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); + + result_64_2_int = _mm_add_epi64( + result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); + + xx_storel_64(&result, result_64_2_int); + + return result; +} + +uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, + int height) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY(width == 8 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { + return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c index a79f22d79..22d7739ec 100644 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -14,6 +14,7 @@ #include <stdio.h> #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_dsp_rtcd.h" static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { @@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { return _mm_add_epi32(v_sq_01_d, v_sq_23_d); } -static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); @@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } -static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, - int height) { +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { int r = 0; __m128i v_acc_q = _mm_setzero_si128(); do { @@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif -static uint64_t +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { int r = 0; diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ +#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ + +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, + int width, int height); + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height); +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); + +#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index d9a53fcc5..1e9f1e27b 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_H_ -#define AOM_DSP_X86_SYNONYMS_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_H_ #include <immintrin.h> @@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } -// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = - _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); @@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { return _mm_srai_epi16(v_tmp_d, bits); } -#endif // AOM_DSP_X86_SYNONYMS_H_ +#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index 39f371fc9..3f69b120e 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_ -#define AOM_DSP_X86_SYNONYMS_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ #include <immintrin.h> @@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } -#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_ +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((__m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((__m128i *)(lo)); + return yy_set_m128i(mhi, mlo); +} + +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} +#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h index f88a1527d..d0d1ee684 100644 --- a/third_party/aom/aom_dsp/x86/transpose_sse2.h +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_ -#define AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index bdff64b8f..b1611ba87 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ -#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ #include <emmintrin.h> #include "aom/aom_integer.h" @@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } #endif -#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h index 58a792424..ed82eee96 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_ -#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ #include <emmintrin.h> #include "aom/aom_integer.h" @@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } -#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index a7ac2c93d..800aef126 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, return comp; } -void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 7e3c5d5db..3c37e77c0 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -16,6 +16,7 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" +#include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" @@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref, int ref_stride) { + const uint8_t *ref, int ref_stride, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { @@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = + (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); + // TODO(Deepa): Remove the memset below when we have + // 4 tap simd for sse2 and ssse3. + if (subpel_search == 1) { + memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width); + } + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); } } @@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; @@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2( pred += 16; } } + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m128i zero = _mm_setzero_si128(); + + if (width == 8) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 16) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + for (int j = 0; j < 2; j++) { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16)); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} |